From 728a3f3ec163aa6d05085446ed9557aab2144cf4 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Tue, 18 Oct 2022 09:14:30 +0200
Subject: [PATCH 001/269] Rename StableDiffusionOnnxPipeline ->
 OnnxStableDiffusionPipeline (#887)

Rename and deprecate
---
 src/diffusers/__init__.py                     |  2 +-
 src/diffusers/pipelines/__init__.py           |  2 +-
 .../pipelines/stable_diffusion/__init__.py    |  2 +-
 ...x.py => pipeline_onnx_stable_diffusion.py} | 28 +++++++++++++++++--
 ...torch_and_transformers_and_onnx_objects.py | 15 ++++++++++
 tests/test_pipelines.py                       |  6 ++--
 6 files changed, 47 insertions(+), 8 deletions(-)
 rename src/diffusers/pipelines/stable_diffusion/{pipeline_stable_diffusion_onnx.py => pipeline_onnx_stable_diffusion.py} (89%)

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 0a0b0b4965dd..0041030c2666 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -58,7 +58,7 @@
     from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
 
 if is_torch_available() and is_transformers_available() and is_onnx_available():
-    from .pipelines import StableDiffusionOnnxPipeline
+    from .pipelines import OnnxStableDiffusionPipeline, StableDiffusionOnnxPipeline
 else:
     from .utils.dummy_torch_and_transformers_and_onnx_objects import *  # noqa F403
 
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 1c31595fb0cf..01391b0db362 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -20,7 +20,7 @@
     )
 
 if is_transformers_available() and is_onnx_available():
-    from .stable_diffusion import StableDiffusionOnnxPipeline
+    from .stable_diffusion import OnnxStableDiffusionPipeline, StableDiffusionOnnxPipeline
 
 if is_transformers_available() and is_flax_available():
     from .stable_diffusion import FlaxStableDiffusionPipeline
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 8c07afe58fc2..b1f3240f3ee3 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -34,7 +34,7 @@ class StableDiffusionPipelineOutput(BaseOutput):
     from .safety_checker import StableDiffusionSafetyChecker
 
 if is_transformers_available() and is_onnx_available():
-    from .pipeline_stable_diffusion_onnx import StableDiffusionOnnxPipeline
+    from .pipeline_onnx_stable_diffusion import OnnxStableDiffusionPipeline, StableDiffusionOnnxPipeline
 
 if is_transformers_available() and is_flax_available():
     import flax
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_onnx.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
similarity index 89%
rename from src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_onnx.py
rename to src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index acd6446af6fe..91bf5012362e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_onnx.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -8,14 +8,14 @@
 from ...onnx_utils import OnnxRuntimeModel
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from ...utils import logging
+from ...utils import deprecate, logging
 from . import StableDiffusionPipelineOutput
 
 
 logger = logging.get_logger(__name__)
 
 
-class StableDiffusionOnnxPipeline(DiffusionPipeline):
+class OnnxStableDiffusionPipeline(DiffusionPipeline):
     vae_decoder: OnnxRuntimeModel
     text_encoder: OnnxRuntimeModel
     tokenizer: CLIPTokenizer
@@ -198,3 +198,27 @@ def __call__(
             return (image, has_nsfw_concept)
 
         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+
+class StableDiffusionOnnxPipeline(OnnxStableDiffusionPipeline):
+    def __init__(
+        self,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        deprecation_message = "Please use `OnnxStableDiffusionPipeline` instead of `StableDiffusionOnnxPipeline`."
+        deprecate("StableDiffusionOnnxPipeline", "1.0.0", deprecation_message)
+        super().__init__(
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
index d099b837296a..72ca97f51450 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
@@ -4,6 +4,21 @@
 from ..utils import DummyObject, requires_backends
 
 
+class OnnxStableDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
 class StableDiffusionOnnxPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers", "onnx"]
 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 69a45c4247aa..659e69554b94 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -37,13 +37,13 @@
     LDMPipeline,
     LDMTextToImagePipeline,
     LMSDiscreteScheduler,
+    OnnxStableDiffusionPipeline,
     PNDMPipeline,
     PNDMScheduler,
     ScoreSdeVePipeline,
     ScoreSdeVeScheduler,
     StableDiffusionImg2ImgPipeline,
     StableDiffusionInpaintPipeline,
-    StableDiffusionOnnxPipeline,
     StableDiffusionPipeline,
     UNet2DConditionModel,
     UNet2DModel,
@@ -2010,7 +2010,7 @@ def test_stable_diffusion_inpaint_pipeline_k_lms(self):
 
     @slow
     def test_stable_diffusion_onnx(self):
-        sd_pipe = StableDiffusionOnnxPipeline.from_pretrained(
+        sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
         )
 
@@ -2214,7 +2214,7 @@ def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
 
         test_callback_fn.has_been_called = False
 
-        pipe = StableDiffusionOnnxPipeline.from_pretrained(
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
         )
         pipe.set_progress_bar_config(disable=None)

From a3efa433eac5feba842350c38a1db29244963fb5 Mon Sep 17 00:00:00 2001
From: Hamish Friedlander <hafriedlander@gmail.com>
Date: Tue, 18 Oct 2022 23:06:46 +1300
Subject: [PATCH 002/269] Fix DDIM on Windows not using int64 for timesteps
 (#819)

---
 src/diffusers/schedulers/scheduling_ddim.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 0f1a40229475..612c24f807a1 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -149,7 +149,7 @@ def __init__(
 
         # setable values
         self.num_inference_steps = None
-        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))
 
     def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
         """
@@ -192,7 +192,7 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         step_ratio = self.config.num_train_timesteps // self.num_inference_steps
         # creates integer timesteps by multiplying by ratio
         # casting to int to avoid issues when num_inference_step is power of 3
-        timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy()
+        timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
         self.timesteps = torch.from_numpy(timesteps).to(device)
         self.timesteps += offset
 

From fbe807bf57cd64c1a1a37751e5ced13b0e4a262c Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Tue, 18 Oct 2022 17:28:51 +0200
Subject: [PATCH 003/269] [dreambooth] allow fine-tuning text encoder (#883)

* allow fine-tuning text encoder

* fix a few things

* update readme
---
 examples/dreambooth/README.md           | 33 +++++++++++++++
 examples/dreambooth/train_dreambooth.py | 55 +++++++++++++++++++------
 2 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index 7094570ad5eb..62ab1b88c0cb 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -160,6 +160,39 @@ accelerate launch train_dreambooth.py \
   --mixed_precision=fp16
 ```
 
+### Fine-tune text encoder with the UNet.
+
+The script also allows to fine-tune the `text_encoder` along with the `unet`. It's been observed experimentally that fine-tuning `text_encoder` gives much better results especially on faces. 
+Pass the `--train_text_encoder` argument to the script to enable training `text_encoder`.
+
+___Note: Training text encoder requires more memory, with this option the training won't fit on 16GB GPU. It needs at least 24GB VRAM.___
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --train_text_encoder \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --use_8bit_adam
+  --gradient_checkpointing \
+  --learning_rate=2e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
 ## Inference
 
 Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index ca39aeff236b..40d5bca45de9 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -1,4 +1,5 @@
 import argparse
+import itertools
 import math
 import os
 from pathlib import Path
@@ -100,6 +101,7 @@ def parse_args():
     parser.add_argument(
         "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
     )
+    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
     parser.add_argument(
         "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
     )
@@ -320,6 +322,15 @@ def main():
         logging_dir=logging_dir,
     )
 
+    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
+    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
+    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
+
     if args.seed is not None:
         set_seed(args.seed)
 
@@ -385,8 +396,14 @@ def main():
     vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
     unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
 
+    vae.requires_grad_(False)
+    if not args.train_text_encoder:
+        text_encoder.requires_grad_(False)
+
     if args.gradient_checkpointing:
         unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
 
     if args.scale_lr:
         args.learning_rate = (
@@ -406,8 +423,11 @@ def main():
     else:
         optimizer_class = torch.optim.AdamW
 
+    params_to_optimize = (
+        itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
+    )
     optimizer = optimizer_class(
-        unet.parameters(),  # only optimize unet
+        params_to_optimize,
         lr=args.learning_rate,
         betas=(args.adam_beta1, args.adam_beta2),
         weight_decay=args.adam_weight_decay,
@@ -467,9 +487,14 @@ def collate_fn(examples):
         num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
     )
 
-    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-        unet, optimizer, train_dataloader, lr_scheduler
-    )
+    if args.train_text_encoder:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
 
     weight_dtype = torch.float32
     if args.mixed_precision == "fp16":
@@ -480,8 +505,9 @@ def collate_fn(examples):
     # Move text_encode and vae to gpu.
     # For mixed precision training we cast the text_encoder and vae weights to half-precision
     # as these models are only used for inference, keeping weights in full precision is not required.
-    text_encoder.to(accelerator.device, dtype=weight_dtype)
     vae.to(accelerator.device, dtype=weight_dtype)
+    if not args.train_text_encoder:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -516,9 +542,8 @@ def collate_fn(examples):
         for step, batch in enumerate(train_dataloader):
             with accelerator.accumulate(unet):
                 # Convert images to latent space
-                with torch.no_grad():
-                    latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
-                    latents = latents * 0.18215
+                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
+                latents = latents * 0.18215
 
                 # Sample noise that we'll add to the latents
                 noise = torch.randn_like(latents)
@@ -532,8 +557,7 @@ def collate_fn(examples):
                 noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
 
                 # Get the text embedding for conditioning
-                with torch.no_grad():
-                    encoder_hidden_states = text_encoder(batch["input_ids"])[0]
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0]
 
                 # Predict the noise residual
                 noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
@@ -556,7 +580,12 @@ def collate_fn(examples):
 
                 accelerator.backward(loss)
                 if accelerator.sync_gradients:
-                    accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm)
+                    params_to_clip = (
+                        itertools.chain(unet.parameters(), text_encoder.parameters())
+                        if args.train_text_encoder
+                        else unet.parameters()
+                    )
+                    accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                 optimizer.step()
                 lr_scheduler.step()
                 optimizer.zero_grad()
@@ -578,7 +607,9 @@ def collate_fn(examples):
     # Create the pipeline using using the trained modules and save it.
     if accelerator.is_main_process:
         pipeline = StableDiffusionPipeline.from_pretrained(
-            args.pretrained_model_name_or_path, unet=accelerator.unwrap_model(unet)
+            args.pretrained_model_name_or_path,
+            unet=accelerator.unwrap_model(unet),
+            text_encoder=accelerator.unwrap_model(text_encoder),
         )
         pipeline.save_pretrained(args.output_dir)
 

From a9908ecfc1730bfd1a2fe0dfc01125fef67746a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDilvinas=20Ledas?= <zilvinas.ledas@dict.lt>
Date: Tue, 18 Oct 2022 18:44:01 +0300
Subject: [PATCH 004/269] Stable Diffusion image-to-image and inpaint using
 onnx. (#552)

* * Stabe Diffusion img2img using onnx.

* * Stabe Diffusion inpaint using onnx.

* Export vae_encoder, upgrade img2img, add test

* updated inpainting pipeline + test

* style

Co-authored-by: anton-l <anton@huggingface.co>
---
 ...ert_stable_diffusion_checkpoint_to_onnx.py |   7 +
 src/diffusers/__init__.py                     |   7 +-
 src/diffusers/pipelines/__init__.py           |   7 +-
 .../pipelines/stable_diffusion/__init__.py    |   2 +
 .../pipeline_onnx_stable_diffusion.py         |   2 +
 .../pipeline_onnx_stable_diffusion_img2img.py | 361 ++++++++++++++++
 .../pipeline_onnx_stable_diffusion_inpaint.py | 385 ++++++++++++++++++
 tests/test_pipelines.py                       |  68 ++++
 8 files changed, 837 insertions(+), 2 deletions(-)
 create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
 create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py

diff --git a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
index a388c0d078dc..fb11616fc3b5 100644
--- a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
+++ b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
@@ -93,6 +93,7 @@ def convert_models(model_path: str, output_path: str, opset: int):
         },
         opset=opset,
     )
+    del pipeline.text_encoder
 
     # UNET
     unet_path = output_path / "unet" / "model.onnx"
@@ -125,6 +126,7 @@ def convert_models(model_path: str, output_path: str, opset: int):
         location="weights.pb",
         convert_attribute=False,
     )
+    del pipeline.unet
 
     # VAE ENCODER
     vae_encoder = pipeline.vae
@@ -157,6 +159,7 @@ def convert_models(model_path: str, output_path: str, opset: int):
         },
         opset=opset,
     )
+    del pipeline.vae
 
     # SAFETY CHECKER
     safety_checker = pipeline.safety_checker
@@ -173,8 +176,10 @@ def convert_models(model_path: str, output_path: str, opset: int):
         },
         opset=opset,
     )
+    del pipeline.safety_checker
 
     onnx_pipeline = StableDiffusionOnnxPipeline(
+        vae_encoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_encoder"),
         vae_decoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_decoder"),
         text_encoder=OnnxRuntimeModel.from_pretrained(output_path / "text_encoder"),
         tokenizer=pipeline.tokenizer,
@@ -187,6 +192,8 @@ def convert_models(model_path: str, output_path: str, opset: int):
     onnx_pipeline.save_pretrained(output_path)
     print("ONNX pipeline saved to", output_path)
 
+    del pipeline
+    del onnx_pipeline
     _ = StableDiffusionOnnxPipeline.from_pretrained(output_path, provider="CPUExecutionProvider")
     print("ONNX pipeline is loadable")
 
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 0041030c2666..b12993225759 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -58,7 +58,12 @@
     from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
 
 if is_torch_available() and is_transformers_available() and is_onnx_available():
-    from .pipelines import OnnxStableDiffusionPipeline, StableDiffusionOnnxPipeline
+    from .pipelines import (
+        OnnxStableDiffusionImg2ImgPipeline,
+        OnnxStableDiffusionInpaintPipeline,
+        OnnxStableDiffusionPipeline,
+        StableDiffusionOnnxPipeline,
+    )
 else:
     from .utils.dummy_torch_and_transformers_and_onnx_objects import *  # noqa F403
 
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 01391b0db362..93eaf2dac678 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -20,7 +20,12 @@
     )
 
 if is_transformers_available() and is_onnx_available():
-    from .stable_diffusion import OnnxStableDiffusionPipeline, StableDiffusionOnnxPipeline
+    from .stable_diffusion import (
+        OnnxStableDiffusionImg2ImgPipeline,
+        OnnxStableDiffusionInpaintPipeline,
+        OnnxStableDiffusionPipeline,
+        StableDiffusionOnnxPipeline,
+    )
 
 if is_transformers_available() and is_flax_available():
     from .stable_diffusion import FlaxStableDiffusionPipeline
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index b1f3240f3ee3..cc1ff35dbad8 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -35,6 +35,8 @@ class StableDiffusionPipelineOutput(BaseOutput):
 
 if is_transformers_available() and is_onnx_available():
     from .pipeline_onnx_stable_diffusion import OnnxStableDiffusionPipeline, StableDiffusionOnnxPipeline
+    from .pipeline_onnx_stable_diffusion_img2img import OnnxStableDiffusionImg2ImgPipeline
+    from .pipeline_onnx_stable_diffusion_inpaint import OnnxStableDiffusionInpaintPipeline
 
 if is_transformers_available() and is_flax_available():
     import flax
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index 91bf5012362e..9fd0a635a8d4 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -26,6 +26,7 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
 
     def __init__(
         self,
+        vae_encoder: OnnxRuntimeModel,
         vae_decoder: OnnxRuntimeModel,
         text_encoder: OnnxRuntimeModel,
         tokenizer: CLIPTokenizer,
@@ -36,6 +37,7 @@ def __init__(
     ):
         super().__init__()
         self.register_modules(
+            vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
             text_encoder=text_encoder,
             tokenizer=tokenizer,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
new file mode 100644
index 000000000000..c7fcef7aa9a0
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -0,0 +1,361 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from transformers import CLIPFeatureExtractor, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...onnx_utils import OnnxRuntimeModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    return 2.0 * image - 1.0
+
+
+class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image to image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+    vae_encoder: OnnxRuntimeModel
+    vae_decoder: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: OnnxRuntimeModel
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+    safety_checker: OnnxRuntimeModel
+    feature_extractor: CLIPFeatureExtractor
+
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        init_image: Union[np.ndarray, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            init_image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        if isinstance(init_image, PIL.Image.Image):
+            init_image = preprocess(init_image)
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="np",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError("The length of `negative_prompt` should be equal to batch_size.")
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            uncond_input_ids = uncond_input.input_ids
+            uncond_embeddings = self.text_encoder(input_ids=uncond_input_ids.astype(np.int32))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt
+            uncond_embeddings = np.repeat(uncond_embeddings, batch_size * num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+        # encode the init image into latents and scale the latents
+        init_latents = self.vae_encoder(sample=init_image)[0]
+        init_latents = 0.18215 * init_latents
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many init images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = len(prompt) // init_latents.shape[0]
+            init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0)
+        elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
+            )
+        else:
+            init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0)
+
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        timesteps = self.scheduler.timesteps[-init_timestep]
+        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
+
+        # add noise to latents using the timesteps
+        noise = np.random.randn(*init_latents.shape).astype(np.float32)
+        init_latents = self.scheduler.add_noise(torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        latents = init_latents
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(
+                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
+            )[0]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            latents = latents.numpy()
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae_decoder(latent_sample=latents)[0]
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np")
+            image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
new file mode 100644
index 000000000000..1ff34dd38378
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -0,0 +1,385 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from tqdm.auto import tqdm
+from transformers import CLIPFeatureExtractor, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...onnx_utils import OnnxRuntimeModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)
+
+
+def preprocess_image(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask):
+    mask = mask.convert("L")
+    w, h = mask.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = np.array(mask).astype(np.float32) / 255.0
+    mask = np.tile(mask, (4, 1, 1))
+    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
+    mask = 1 - mask  # repaint white, keep black
+    return mask
+
+
+class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+    vae_encoder: OnnxRuntimeModel
+    vae_decoder: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: OnnxRuntimeModel
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+    safety_checker: OnnxRuntimeModel
+    feature_extractor: CLIPFeatureExtractor
+
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+        logger.info("`OnnxStableDiffusionInpaintPipeline` is experimental and will very likely change in the future.")
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        init_image: Union[np.ndarray, PIL.Image.Image],
+        mask_image: Union[np.ndarray, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            init_image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. This is the image whose masked region will be inpainted.
+            mask_image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
+                is 1, the denoising process will be run on the masked area for the full number of iterations specified
+                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
+                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="np",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            uncond_input_ids = uncond_input.input_ids
+            uncond_embeddings = self.text_encoder(input_ids=uncond_input_ids.astype(np.int32))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt
+            uncond_embeddings = np.repeat(uncond_embeddings, batch_size * num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+        # preprocess image
+        if not isinstance(init_image, torch.FloatTensor):
+            init_image = preprocess_image(init_image)
+
+        # encode the init image into latents and scale the latents
+        init_latents = self.vae_encoder(sample=init_image)[0]
+        init_latents = 0.18215 * init_latents
+
+        # Expand init_latents for batch_size and num_images_per_prompt
+        init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt, axis=0)
+        init_latents_orig = init_latents
+
+        # preprocess mask
+        if not isinstance(mask_image, np.ndarray):
+            mask_image = preprocess_mask(mask_image)
+        mask = np.concatenate([mask_image] * batch_size * num_images_per_prompt)
+
+        # check sizes
+        if not mask.shape == init_latents.shape:
+            raise ValueError("The mask and init_image should be the same size!")
+
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        timesteps = self.scheduler.timesteps[-init_timestep]
+        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
+
+        # add noise to latents using the timesteps
+        noise = np.random.randn(*init_latents.shape).astype(np.float32)
+        init_latents = self.scheduler.add_noise(torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        latents = init_latents
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+
+        for i, t in tqdm(enumerate(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(
+                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
+            )[0]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            latents = latents.numpy()
+            # masking
+            init_latents_proper = self.scheduler.add_noise(
+                torch.from_numpy(init_latents_orig), torch.from_numpy(noise), torch.tensor([t])
+            )
+
+            latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae_decoder(latent_sample=latents)[0]
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np")
+            image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 659e69554b94..369ba5e9656d 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -37,6 +37,8 @@
     LDMPipeline,
     LDMTextToImagePipeline,
     LMSDiscreteScheduler,
+    OnnxStableDiffusionImg2ImgPipeline,
+    OnnxStableDiffusionInpaintPipeline,
     OnnxStableDiffusionPipeline,
     PNDMPipeline,
     PNDMScheduler,
@@ -2025,6 +2027,72 @@ def test_stable_diffusion_onnx(self):
         expected_slice = np.array([0.3602, 0.3688, 0.3652, 0.3895, 0.3782, 0.3747, 0.3927, 0.4241, 0.4327])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
+    @slow
+    def test_stable_diffusion_img2img_onnx(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((768, 512))
+
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        np.random.seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            num_inference_steps=8,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 383:386, -1]
+
+        assert images.shape == (1, 512, 768, 3)
+        expected_slice = np.array([[0.4806, 0.5125, 0.5453, 0.4846, 0.4984, 0.4955, 0.4830, 0.4962, 0.4969]])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    @slow
+    def test_stable_diffusion_inpaint_onnx(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+
+        pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A red cat sitting on a park bench"
+
+        np.random.seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            mask_image=mask_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            num_inference_steps=8,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 255:258, -1]
+
+        assert images.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.3524, 0.3289, 0.3464, 0.3872, 0.4129, 0.3566, 0.3709, 0.4128, 0.3734])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
     @slow
     @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
     def test_stable_diffusion_text2img_intermediate_state(self):

From 8eb9d9703d0fe66828f118b302faf668bdb1ce34 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Wed, 19 Oct 2022 11:26:32 +0200
Subject: [PATCH 005/269] Improve ONNX img2img numpy handling, temporarily fix
 the tests (#899)

* [WIP] Onnx img2img determinism

* more numpy + seed

* numpy inpainting, tolerance

* revert test workflow
---
 .../convert_stable_diffusion_checkpoint_to_onnx.py |  6 +++---
 .../pipeline_onnx_stable_diffusion_img2img.py      | 14 +++++++-------
 .../pipeline_onnx_stable_diffusion_inpaint.py      | 14 +++++++-------
 tests/test_pipelines.py                            |  6 +++---
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
index fb11616fc3b5..a9bf4a6c1698 100644
--- a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
+++ b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
@@ -21,7 +21,7 @@
 from torch.onnx import export
 
 import onnx
-from diffusers import StableDiffusionOnnxPipeline, StableDiffusionPipeline
+from diffusers import OnnxStableDiffusionPipeline, StableDiffusionPipeline
 from diffusers.onnx_utils import OnnxRuntimeModel
 from packaging import version
 
@@ -178,7 +178,7 @@ def convert_models(model_path: str, output_path: str, opset: int):
     )
     del pipeline.safety_checker
 
-    onnx_pipeline = StableDiffusionOnnxPipeline(
+    onnx_pipeline = OnnxStableDiffusionPipeline(
         vae_encoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_encoder"),
         vae_decoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_decoder"),
         text_encoder=OnnxRuntimeModel.from_pretrained(output_path / "text_encoder"),
@@ -194,7 +194,7 @@ def convert_models(model_path: str, output_path: str, opset: int):
 
     del pipeline
     del onnx_pipeline
-    _ = StableDiffusionOnnxPipeline.from_pretrained(output_path, provider="CPUExecutionProvider")
+    _ = OnnxStableDiffusionPipeline.from_pretrained(output_path, provider="CPUExecutionProvider")
     print("ONNX pipeline is loadable")
 
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index c7fcef7aa9a0..89cc054a424e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -293,12 +293,15 @@ def __call__(
         init_timestep = int(num_inference_steps * strength) + offset
         init_timestep = min(init_timestep, num_inference_steps)
 
-        timesteps = self.scheduler.timesteps[-init_timestep]
-        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
+        timesteps = self.scheduler.timesteps.numpy()[-init_timestep]
+        timesteps = np.array([timesteps] * batch_size * num_images_per_prompt)
 
         # add noise to latents using the timesteps
         noise = np.random.randn(*init_latents.shape).astype(np.float32)
-        init_latents = self.scheduler.add_noise(torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps)
+        init_latents = self.scheduler.add_noise(
+            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
+        )
+        init_latents = init_latents.numpy()
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -312,10 +315,7 @@ def __call__(
         latents = init_latents
 
         t_start = max(num_inference_steps - init_timestep + offset, 0)
-
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+        timesteps = self.scheduler.timesteps[t_start:].numpy()
 
         for i, t in enumerate(self.progress_bar(timesteps)):
             # expand the latents if we are doing classifier free guidance
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 1ff34dd38378..3e23ef29eef9 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -311,12 +311,15 @@ def __call__(
         init_timestep = int(num_inference_steps * strength) + offset
         init_timestep = min(init_timestep, num_inference_steps)
 
-        timesteps = self.scheduler.timesteps[-init_timestep]
-        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
+        timesteps = self.scheduler.timesteps.numpy()[-init_timestep]
+        timesteps = np.array([timesteps] * batch_size * num_images_per_prompt)
 
         # add noise to latents using the timesteps
         noise = np.random.randn(*init_latents.shape).astype(np.float32)
-        init_latents = self.scheduler.add_noise(torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps)
+        init_latents = self.scheduler.add_noise(
+            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
+        )
+        init_latents = init_latents.numpy()
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -330,10 +333,7 @@ def __call__(
         latents = init_latents
 
         t_start = max(num_inference_steps - init_timestep + offset, 0)
-
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+        timesteps = self.scheduler.timesteps[t_start:].numpy()
 
         for i, t in tqdm(enumerate(timesteps)):
             # expand the latents if we are doing classifier free guidance
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 369ba5e9656d..44811eeb0367 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -2034,7 +2034,6 @@ def test_stable_diffusion_img2img_onnx(self):
             "/img2img/sketch-mountains-input.jpg"
         )
         init_image = init_image.resize((768, 512))
-
         pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
         )
@@ -2055,8 +2054,9 @@ def test_stable_diffusion_img2img_onnx(self):
         image_slice = images[0, 255:258, 383:386, -1]
 
         assert images.shape == (1, 512, 768, 3)
-        expected_slice = np.array([[0.4806, 0.5125, 0.5453, 0.4846, 0.4984, 0.4955, 0.4830, 0.4962, 0.4969]])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        expected_slice = np.array([0.4830, 0.5242, 0.5603, 0.5016, 0.5131, 0.5111, 0.4928, 0.5025, 0.5055])
+        # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
 
     @slow
     def test_stable_diffusion_inpaint_onnx(self):

From bd216073fe3624693cc57c27c2a9ef649e02f71d Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 19 Oct 2022 12:31:53 +0200
Subject: [PATCH 006/269] make fix copies

---
 ...torch_and_transformers_and_onnx_objects.py | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
index 72ca97f51450..221020030ea1 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
@@ -4,6 +4,36 @@
 from ..utils import DummyObject, requires_backends
 
 
+class OnnxStableDiffusionImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
+class OnnxStableDiffusionInpaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
 class OnnxStableDiffusionPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers", "onnx"]
 

From 6ea83608adcf9302d3d24733dc319ab4ea9607ad Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 19 Oct 2022 12:55:37 +0200
Subject: [PATCH 007/269] [Stable Diffusion Inpainting] Deprecate inpainting
 pipeline in favor of official one (#903)

* finish

* up

* Apply suggestions from code review

Co-authored-by: Anton Lozhkov <anton@huggingface.co>

* Update src/diffusers/pipeline_utils.py

* Finish

Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 src/diffusers/__init__.py                     |   1 +
 src/diffusers/pipeline_utils.py               |  20 +
 src/diffusers/pipelines/__init__.py           |   1 +
 .../pipelines/stable_diffusion/__init__.py    |   1 +
 .../pipeline_stable_diffusion_inpaint.py      |   4 +-
 ...ipeline_stable_diffusion_inpaint_legacy.py | 407 ++++++++++++++++++
 6 files changed, 432 insertions(+), 2 deletions(-)
 create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index b12993225759..f36d3284777e 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -52,6 +52,7 @@
         LDMTextToImagePipeline,
         StableDiffusionImg2ImgPipeline,
         StableDiffusionInpaintPipeline,
+        StableDiffusionInpaintPipelineLegacy,
         StableDiffusionPipeline,
     )
 else:
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index d3e6113dac8e..37bc228b10e6 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -40,6 +40,7 @@
     ONNX_WEIGHTS_NAME,
     WEIGHTS_NAME,
     BaseOutput,
+    deprecate,
     is_transformers_available,
     logging,
 )
@@ -413,6 +414,25 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
             pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
 
+        # To be removed in 1.0.0
+        if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse(
+            version.parse(config_dict["_diffusers_version"]).base_version
+        ) <= version.parse("0.5.1"):
+            from diffusers import StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy
+
+            pipeline_class = StableDiffusionInpaintPipelineLegacy
+
+            deprecation_message = (
+                "You are using a legacy checkpoint for inpainting with Stable Diffusion, therefore we are loading the"
+                f" {StableDiffusionInpaintPipelineLegacy} class instead of {StableDiffusionInpaintPipeline}. For"
+                " better inpainting results, we strongly suggest using Stable Diffusion's official inpainting"
+                " checkpoint: https://huggingface.co/runwayml/stable-diffusion-inpainting instead or adapting your"
+                f" checkpoint {pretrained_model_name_or_path} to the format of"
+                " https://huggingface.co/runwayml/stable-diffusion-inpainting. Note that we do not actively maintain"
+                " the {StableDiffusionInpaintPipelineLegacy} class and will likely remove it in version 1.0.0."
+            )
+            deprecate("StableDiffusionInpaintPipelineLegacy", "1.0.0", deprecation_message, standard_warn=False)
+
         # some modules can be passed directly to the init
         # in this case they are already instantiated in `kwargs`
         # extract them here
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 93eaf2dac678..3c4f877fa830 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -16,6 +16,7 @@
     from .stable_diffusion import (
         StableDiffusionImg2ImgPipeline,
         StableDiffusionInpaintPipeline,
+        StableDiffusionInpaintPipelineLegacy,
         StableDiffusionPipeline,
     )
 
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index cc1ff35dbad8..5a452138b7a8 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -31,6 +31,7 @@ class StableDiffusionPipelineOutput(BaseOutput):
     from .pipeline_stable_diffusion import StableDiffusionPipeline
     from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
     from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
+    from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
     from .safety_checker import StableDiffusionSafetyChecker
 
 if is_transformers_available() and is_onnx_available():
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index cb4d552a6f50..077a7b0748d9 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -82,8 +82,6 @@ def __init__(
         feature_extractor: CLIPFeatureExtractor,
     ):
         super().__init__()
-        logger.info("`StableDiffusionInpaintPipeline` is experimental and will very likely change in the future.")
-
         if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
@@ -223,6 +221,8 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        # TODO(Suraj) - adapt to your use case
+
         if isinstance(prompt, str):
             batch_size = 1
         elif isinstance(prompt, list):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
new file mode 100644
index 000000000000..038178dd9a0a
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -0,0 +1,407 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from tqdm.auto import tqdm
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)
+
+
+def preprocess_image(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask):
+    mask = mask.convert("L")
+    w, h = mask.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = np.array(mask).astype(np.float32) / 255.0
+    mask = np.tile(mask, (4, 1, 1))
+    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
+    mask = 1 - mask  # repaint white, keep black
+    mask = torch.from_numpy(mask)
+    return mask
+
+
+class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `set_attention_slice`
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. This is the image whose masked region will be inpainted.
+            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
+                is 1, the denoising process will be run on the masked area for the full number of iterations specified
+                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
+                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt
+            uncond_embeddings = uncond_embeddings.repeat_interleave(batch_size * num_images_per_prompt, dim=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # preprocess image
+        if not isinstance(init_image, torch.FloatTensor):
+            init_image = preprocess_image(init_image)
+
+        # encode the init image into latents and scale the latents
+        latents_dtype = text_embeddings.dtype
+        init_image = init_image.to(device=self.device, dtype=latents_dtype)
+        init_latent_dist = self.vae.encode(init_image).latent_dist
+        init_latents = init_latent_dist.sample(generator=generator)
+        init_latents = 0.18215 * init_latents
+
+        # Expand init_latents for batch_size and num_images_per_prompt
+        init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
+        init_latents_orig = init_latents
+
+        # preprocess mask
+        if not isinstance(mask_image, torch.FloatTensor):
+            mask_image = preprocess_mask(mask_image)
+        mask_image = mask_image.to(device=self.device, dtype=latents_dtype)
+        mask = torch.cat([mask_image] * batch_size * num_images_per_prompt)
+
+        # check sizes
+        if not mask.shape == init_latents.shape:
+            raise ValueError("The mask and init_image should be the same size!")
+
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        timesteps = self.scheduler.timesteps[-init_timestep]
+        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
+
+        # add noise to latents using the timesteps
+        noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
+        init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        latents = init_latents
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+
+        for i, t in tqdm(enumerate(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            # masking
+            init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
+
+            latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

From 83b696e6c088b25da092bc42feff1094a2f4d57f Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 19 Oct 2022 15:54:07 +0200
Subject: [PATCH 008/269] [Communit Pipeline] Make sure "mega" uses correct
 inpaint pipeline (#908)

---
 examples/community/stable_diffusion_mega.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py
index 67d39f545544..b253370fa2a6 100644
--- a/examples/community/stable_diffusion_mega.py
+++ b/examples/community/stable_diffusion_mega.py
@@ -10,7 +10,7 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
     StableDiffusionImg2ImgPipeline,
-    StableDiffusionInpaintPipeline,
+    StableDiffusionInpaintPipelineLegacy,
     StableDiffusionPipeline,
     UNet2DConditionModel,
 )
@@ -136,7 +136,7 @@ def inpaint(
         callback_steps: Optional[int] = 1,
     ):
         # For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
-        return StableDiffusionInpaintPipeline(**self.components)(
+        return StableDiffusionInpaintPipelineLegacy(**self.components)(
             prompt=prompt,
             init_image=init_image,
             mask_image=mask_image,

From b35d88c536081c7e12c4605cf5b7c9ae2e20af72 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Wed, 19 Oct 2022 16:11:50 +0200
Subject: [PATCH 009/269] Stable diffusion inpainting. (#904)

* begin pipe

* add new pipeline

* add tests

* correct fast test

* up

* Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py

* Update tests/test_pipelines.py

* up

* up

* make style

* add fp16 test

* doc, comments

* up

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 .../pipeline_stable_diffusion_inpaint.py      | 213 ++++++++--------
 tests/test_pipelines.py                       | 231 ++++++++++++++++--
 2 files changed, 333 insertions(+), 111 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 077a7b0748d9..abcc7fba6e8a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -5,7 +5,6 @@
 import torch
 
 import PIL
-from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -17,30 +16,24 @@
 from .safety_checker import StableDiffusionSafetyChecker
 
 
-logger = logging.get_logger(__name__)
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-def preprocess_image(image):
-    w, h = image.size
-    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
-    image = np.array(image).astype(np.float32) / 255.0
+def prepare_mask_and_masked_image(image, mask):
+    image = np.array(image.convert("RGB"))
     image = image[None].transpose(0, 3, 1, 2)
-    image = torch.from_numpy(image)
-    return 2.0 * image - 1.0
-
-
-def preprocess_mask(mask):
-    mask = mask.convert("L")
-    w, h = mask.size
-    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
-    mask = np.array(mask).astype(np.float32) / 255.0
-    mask = np.tile(mask, (4, 1, 1))
-    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
-    mask = 1 - mask  # repaint white, keep black
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    mask = np.array(mask.convert("L"))
+    mask = mask.astype(np.float32) / 255.0
+    mask = mask[None, None]
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
     mask = torch.from_numpy(mask)
-    return mask
+
+    masked_image = image * (mask < 0.5)
+
+    return mask, masked_image
 
 
 class StableDiffusionInpaintPipeline(DiffusionPipeline):
@@ -82,6 +75,7 @@ def __init__(
         feature_extractor: CLIPFeatureExtractor,
     ):
         super().__init__()
+
         if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
@@ -140,22 +134,24 @@ def disable_attention_slicing(self):
         Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
         back to computing attention in one step.
         """
-        # set slice_size = `None` to disable `set_attention_slice`
+        # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
     @torch.no_grad()
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         mask_image: Union[torch.FloatTensor, PIL.Image.Image],
-        strength: float = 0.8,
-        num_inference_steps: Optional[int] = 50,
-        guidance_scale: Optional[float] = 7.5,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: Optional[int] = 1,
-        eta: Optional[float] = 0.0,
+        eta: float = 0.0,
         generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -168,22 +164,21 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, that will be used as the starting point for the
-                process. This is the image whose masked region will be inpainted.
-            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
-                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
-                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
-                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
-            strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
-                is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
-                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
-                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
-                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
             guidance_scale (`float`, *optional*, defaults to 7.5):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -201,6 +196,10 @@ def __call__(
             generator (`torch.Generator`, *optional*):
                 A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                 deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -221,7 +220,6 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        # TODO(Suraj) - adapt to your use case
 
         if isinstance(prompt, str):
             batch_size = 1
@@ -230,8 +228,8 @@ def __call__(
         else:
             raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
 
-        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
         if (callback_steps is None) or (
             callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
@@ -241,9 +239,6 @@ def __call__(
                 f" {type(callback_steps)}."
             )
 
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-
         # get prompt text embeddings
         text_inputs = self.tokenizer(
             prompt,
@@ -262,8 +257,10 @@ def __call__(
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
         text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
 
-        # duplicate text embeddings for each generation per prompt
-        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -300,50 +297,78 @@ def __call__(
             )
             uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
 
-            # duplicate unconditional embeddings for each generation per prompt
-            uncond_embeddings = uncond_embeddings.repeat_interleave(batch_size * num_images_per_prompt, dim=0)
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
 
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
             # to avoid doing two forward passes
             text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
 
-        # preprocess image
-        if not isinstance(init_image, torch.FloatTensor):
-            init_image = preprocess_image(init_image)
-
-        # encode the init image into latents and scale the latents
+        # get the initial random noise unless the user supplied it
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        num_channels_latents = self.vae.config.latent_channels
+        latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
-        init_image = init_image.to(device=self.device, dtype=latents_dtype)
-        init_latent_dist = self.vae.encode(init_image).latent_dist
-        init_latents = init_latent_dist.sample(generator=generator)
-        init_latents = 0.18215 * init_latents
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # prepare mask and masked_image
+        mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+        mask = mask.to(device=self.device, dtype=text_embeddings.dtype)
+        masked_image = masked_image.to(device=self.device, dtype=text_embeddings.dtype)
 
-        # Expand init_latents for batch_size and num_images_per_prompt
-        init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
-        init_latents_orig = init_latents
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        mask = torch.nn.functional.interpolate(mask, size=(height // 8, width // 8))
+
+        # encode the mask image into latents space so we can concatenate it to the latents
+        masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
+        masked_image_latents = 0.18215 * masked_image_latents
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        mask = mask.repeat(num_images_per_prompt, 1, 1, 1)
+        masked_image_latents = masked_image_latents.repeat(num_images_per_prompt, 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
 
-        # preprocess mask
-        if not isinstance(mask_image, torch.FloatTensor):
-            mask_image = preprocess_mask(mask_image)
-        mask_image = mask_image.to(device=self.device, dtype=latents_dtype)
-        mask = torch.cat([mask_image] * batch_size * num_images_per_prompt)
+        num_channels_mask = mask.shape[1]
+        num_channels_masked_image = masked_image_latents.shape[1]
 
-        # check sizes
-        if not mask.shape == init_latents.shape:
-            raise ValueError("The mask and init_image should be the same size!")
+        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                " `pipeline.unet` or your `mask_image` or `image` input."
+            )
 
-        # get the original timestep using init_timestep
-        offset = self.scheduler.config.get("steps_offset", 0)
-        init_timestep = int(num_inference_steps * strength) + offset
-        init_timestep = min(init_timestep, num_inference_steps)
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
 
-        timesteps = self.scheduler.timesteps[-init_timestep]
-        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
 
-        # add noise to latents using the timesteps
-        noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
-        init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -354,17 +379,13 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
-        latents = init_latents
-
-        t_start = max(num_inference_steps - init_timestep + offset, 0)
-
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
-
-        for i, t in tqdm(enumerate(timesteps)):
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            # concat latents, mask, masked_image_latents in the channel dimension
+            latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
             latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
             # predict the noise residual
@@ -377,10 +398,6 @@ def __call__(
 
             # compute the previous noisy sample x_t -> x_t-1
             latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-            # masking
-            init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
-
-            latents = (init_latents_proper * mask) + (latents * (1 - mask))
 
             # call the callback, if provided
             if callback is not None and i % callback_steps == 0:
@@ -390,13 +407,17 @@ def __call__(
         image = self.vae.decode(latents).sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
 
         if self.safety_checker is not None:
             safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
                 self.device
             )
-            image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
         else:
             has_nsfw_concept = None
 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 44811eeb0367..e40d01e198e1 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -46,6 +46,7 @@
     ScoreSdeVeScheduler,
     StableDiffusionImg2ImgPipeline,
     StableDiffusionInpaintPipeline,
+    StableDiffusionInpaintPipelineLegacy,
     StableDiffusionPipeline,
     UNet2DConditionModel,
     UNet2DModel,
@@ -189,6 +190,21 @@ def dummy_cond_unet(self):
         )
         return model
 
+    @property
+    def dummy_cond_unet_inpaint(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
     @property
     def dummy_vq_model(self):
         torch.manual_seed(0)
@@ -897,7 +913,7 @@ def test_stable_diffusion_img2img_k_lms(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
-    def test_stable_diffusion_inpaint(self):
+    def test_stable_diffusion_inpaint_legacy(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         unet = self.dummy_cond_unet
         scheduler = PNDMScheduler(skip_prk_steps=True)
@@ -910,7 +926,7 @@ def test_stable_diffusion_inpaint(self):
         mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
 
         # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionInpaintPipeline(
+        sd_pipe = StableDiffusionInpaintPipelineLegacy(
             unet=unet,
             scheduler=scheduler,
             vae=vae,
@@ -956,7 +972,66 @@ def test_stable_diffusion_inpaint(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
-    def test_stable_diffusion_inpaint_negative_prompt(self):
+    def test_stable_diffusion_inpaint(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet_inpaint
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionInpaintPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+            mask_image=mask_image,
+        )
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+            mask_image=mask_image,
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.5075, 0.4485, 0.4558, 0.5369, 0.5369, 0.5236, 0.5127, 0.4983, 0.4776])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_legacy_negative_prompt(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         unet = self.dummy_cond_unet
         scheduler = PNDMScheduler(skip_prk_steps=True)
@@ -969,7 +1044,7 @@ def test_stable_diffusion_inpaint_negative_prompt(self):
         mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
 
         # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionInpaintPipeline(
+        sd_pipe = StableDiffusionInpaintPipelineLegacy(
             unet=unet,
             scheduler=scheduler,
             vae=vae,
@@ -1122,7 +1197,7 @@ def test_stable_diffusion_img2img_num_images_per_prompt(self):
 
         assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)
 
-    def test_stable_diffusion_inpaint_num_images_per_prompt(self):
+    def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
         device = "cpu"
         unet = self.dummy_cond_unet
         scheduler = PNDMScheduler(skip_prk_steps=True)
@@ -1135,7 +1210,7 @@ def test_stable_diffusion_inpaint_num_images_per_prompt(self):
         mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
 
         # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionInpaintPipeline(
+        sd_pipe = StableDiffusionInpaintPipelineLegacy(
             unet=unet,
             scheduler=scheduler,
             vae=vae,
@@ -1274,15 +1349,15 @@ def test_stable_diffusion_img2img_fp16(self):
 
     @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
     def test_stable_diffusion_inpaint_fp16(self):
-        """Test that stable diffusion inpaint works with fp16"""
-        unet = self.dummy_cond_unet
+        """Test that stable diffusion inpaint_legacy works with fp16"""
+        unet = self.dummy_cond_unet_inpaint
         scheduler = PNDMScheduler(skip_prk_steps=True)
         vae = self.dummy_vae
         bert = self.dummy_text_encoder
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
         mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
 
         # put models in fp16
@@ -1297,8 +1372,8 @@ def test_stable_diffusion_inpaint_fp16(self):
             vae=vae,
             text_encoder=bert,
             tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
+            safety_checker=None,
+            feature_extractor=None,
         )
         sd_pipe = sd_pipe.to(torch_device)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -1310,11 +1385,11 @@ def test_stable_diffusion_inpaint_fp16(self):
             generator=generator,
             num_inference_steps=2,
             output_type="np",
-            init_image=init_image,
+            image=init_image,
             mask_image=mask_image,
         ).images
 
-        assert image.shape == (1, 32, 32, 3)
+        assert image.shape == (1, 128, 128, 3)
 
 
 class PipelineTesterMixin(unittest.TestCase):
@@ -1924,6 +1999,90 @@ def test_stable_diffusion_img2img_pipeline_k_lms(self):
     @slow
     @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
     def test_stable_diffusion_inpaint_pipeline(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/yellow_cat_sitting_on_a_park_bench.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        model_id = "fusing/sd-inpaint-temp"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id,
+            safety_checker=self.dummy_safety_checker,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
+
+    @slow
+    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
+    def test_stable_diffusion_inpaint_pipeline_fp16(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/yellow_cat_sitting_on_a_park_bench.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        model_id = "fusing/sd-inpaint-temp"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id,
+            revision="fp16",
+            torch_dtype=torch.float16,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
+
+    @slow
+    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
+    def test_stable_diffusion_inpaint_legacy_pipeline(self):
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo.png"
@@ -1966,7 +2125,49 @@ def test_stable_diffusion_inpaint_pipeline(self):
 
     @slow
     @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_inpaint_pipeline_k_lms(self):
+    def test_stable_diffusion_inpaint_pipeline_pndm(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/yellow_cat_sitting_on_a_park_bench_pndm.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        pndm = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True)
+        model_id = "fusing/sd-inpaint-temp"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id, safety_checker=self.dummy_safety_checker, scheduler=pndm
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
+
+    @slow
+    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
+    def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
+        # TODO(Anton, Patrick) - I think we can remove this test soon
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo.png"
@@ -2199,7 +2400,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
 
     @slow
     @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_inpaint_intermediate_state(self):
+    def test_stable_diffusion_inpaint_legacy_intermediate_state(self):
         number_of_steps = 0
 
         def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:

From 46557121e6421e4d4e7f294584eac4961ef7c728 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 19 Oct 2022 16:36:51 +0200
Subject: [PATCH 010/269] finish tests (#909)

---
 tests/test_pipelines.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index e40d01e198e1..2a581e612613 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -2013,7 +2013,7 @@ def test_stable_diffusion_inpaint_pipeline(self):
         )
         expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
-        model_id = "fusing/sd-inpaint-temp"
+        model_id = "runwayml/stable-diffusion-inpainting"
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             model_id,
             safety_checker=self.dummy_safety_checker,
@@ -2050,11 +2050,11 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self):
         )
         expected_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/yellow_cat_sitting_on_a_park_bench.png"
+            "/in_paint/yellow_cat_sitting_on_a_park_bench_fp16.png"
         )
         expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
-        model_id = "fusing/sd-inpaint-temp"
+        model_id = "runwayml/stable-diffusion-inpainting"
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             model_id,
             revision="fp16",
@@ -2141,7 +2141,7 @@ def test_stable_diffusion_inpaint_pipeline_pndm(self):
         expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         pndm = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True)
-        model_id = "fusing/sd-inpaint-temp"
+        model_id = "runwayml/stable-diffusion-inpainting"
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             model_id, safety_checker=self.dummy_safety_checker, scheduler=pndm
         )

From 89d124945add51b4218cf0a9028a3966cc9dfd47 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Wed, 19 Oct 2022 17:03:31 +0200
Subject: [PATCH 011/269] ONNX supervised inpainting (#906)

* ONNX supervised inpainting

* sync with the torch pipeline

* fix concat

* update ref values

* back to 8 steps

* type fix

* make fix-copies
---
 ...ert_stable_diffusion_checkpoint_to_onnx.py |   7 +-
 .../pipeline_onnx_stable_diffusion_inpaint.py | 166 +++++++++---------
 .../dummy_torch_and_transformers_objects.py   |  15 ++
 tests/test_pipelines.py                       |   7 +-
 4 files changed, 108 insertions(+), 87 deletions(-)

diff --git a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
index a9bf4a6c1698..2a2c2031a7b5 100644
--- a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
+++ b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
@@ -99,7 +99,12 @@ def convert_models(model_path: str, output_path: str, opset: int):
     unet_path = output_path / "unet" / "model.onnx"
     onnx_export(
         pipeline.unet,
-        model_args=(torch.randn(2, 4, 64, 64), torch.LongTensor([0, 1]), torch.randn(2, 77, 768), False),
+        model_args=(
+            torch.randn(2, pipeline.unet.in_channels, 64, 64),
+            torch.LongTensor([0, 1]),
+            torch.randn(2, 77, 768),
+            False,
+        ),
         output_path=unet_path,
         ordered_input_names=["sample", "timestep", "encoder_hidden_states", "return_dict"],
         output_names=["out_sample"],  # has to be different from "sample" for correct tracing
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 3e23ef29eef9..30f8d7fcc3b8 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -5,7 +5,6 @@
 import torch
 
 import PIL
-from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -16,28 +15,29 @@
 from . import StableDiffusionPipelineOutput
 
 
-logger = logging.get_logger(__name__)
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-def preprocess_image(image):
-    w, h = image.size
-    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
-    image = np.array(image).astype(np.float32) / 255.0
+NUM_UNET_INPUT_CHANNELS = 9
+NUM_LATENT_CHANNELS = 4
+
+
+def prepare_mask_and_masked_image(image, mask, latents_shape):
+    image = np.array(image.convert("RGB"))
     image = image[None].transpose(0, 3, 1, 2)
-    return 2.0 * image - 1.0
+    image = image.astype(np.float32) / 127.5 - 1.0
 
+    image_mask = np.array(mask.convert("L"))
+    masked_image = image * (image_mask < 127.5)
 
-def preprocess_mask(mask):
-    mask = mask.convert("L")
-    w, h = mask.size
-    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
-    mask = np.array(mask).astype(np.float32) / 255.0
-    mask = np.tile(mask, (4, 1, 1))
-    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
-    mask = 1 - mask  # repaint white, keep black
-    return mask
+    mask = mask.resize((latents_shape[1], latents_shape[0]), PIL.Image.NEAREST)
+    mask = np.array(mask.convert("L"))
+    mask = mask.astype(np.float32) / 255.0
+    mask = mask[None, None]
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
+
+    return mask, masked_image
 
 
 class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
@@ -129,14 +129,16 @@ def __init__(
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        init_image: Union[np.ndarray, PIL.Image.Image],
-        mask_image: Union[np.ndarray, PIL.Image.Image],
-        strength: float = 0.8,
-        num_inference_steps: Optional[int] = 50,
-        guidance_scale: Optional[float] = 7.5,
+        image: PIL.Image.Image,
+        mask_image: PIL.Image.Image,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: Optional[int] = 1,
-        eta: Optional[float] = 0.0,
+        eta: float = 0.0,
+        latents: Optional[np.ndarray] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
@@ -149,22 +151,21 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            init_image (`np.ndarray` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, that will be used as the starting point for the
-                process. This is the image whose masked region will be inpainted.
-            mask_image (`np.ndarray` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
-                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
-                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
-                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
-            strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
-                is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
-                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
-                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
-                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
             guidance_scale (`float`, *optional*, defaults to 7.5):
                 Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                 `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -179,6 +180,10 @@ def __call__(
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
+            latents (`np.ndarray`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -206,8 +211,8 @@ def __call__(
         else:
             raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
 
-        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
         if (callback_steps is None) or (
             callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
@@ -285,41 +290,46 @@ def __call__(
             # to avoid doing two forward passes
             text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
 
-        # preprocess image
-        if not isinstance(init_image, torch.FloatTensor):
-            init_image = preprocess_image(init_image)
+        num_channels_latents = NUM_LATENT_CHANNELS
+        latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            latents = np.random.randn(*latents_shape).astype(latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
 
-        # encode the init image into latents and scale the latents
-        init_latents = self.vae_encoder(sample=init_image)[0]
-        init_latents = 0.18215 * init_latents
+        # prepare mask and masked_image
+        mask, masked_image = prepare_mask_and_masked_image(image, mask_image, latents_shape[-2:])
+        mask = mask.astype(latents.dtype)
+        masked_image = masked_image.astype(latents.dtype)
 
-        # Expand init_latents for batch_size and num_images_per_prompt
-        init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt, axis=0)
-        init_latents_orig = init_latents
+        masked_image_latents = self.vae_encoder(sample=masked_image)[0]
+        masked_image_latents = 0.18215 * masked_image_latents
 
-        # preprocess mask
-        if not isinstance(mask_image, np.ndarray):
-            mask_image = preprocess_mask(mask_image)
-        mask = np.concatenate([mask_image] * batch_size * num_images_per_prompt)
+        mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            np.concatenate([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
 
-        # check sizes
-        if not mask.shape == init_latents.shape:
-            raise ValueError("The mask and init_image should be the same size!")
+        num_channels_mask = mask.shape[1]
+        num_channels_masked_image = masked_image_latents.shape[1]
 
-        # get the original timestep using init_timestep
-        offset = self.scheduler.config.get("steps_offset", 0)
-        init_timestep = int(num_inference_steps * strength) + offset
-        init_timestep = min(init_timestep, num_inference_steps)
+        unet_input_channels = NUM_UNET_INPUT_CHANNELS
+        if num_channels_latents + num_channels_mask + num_channels_masked_image != unet_input_channels:
+            raise ValueError(
+                "Incorrect configuration settings! The config of `pipeline.unet` expects"
+                f" {unet_input_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                " `pipeline.unet` or your `mask_image` or `image` input."
+            )
 
-        timesteps = self.scheduler.timesteps.numpy()[-init_timestep]
-        timesteps = np.array([timesteps] * batch_size * num_images_per_prompt)
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
 
-        # add noise to latents using the timesteps
-        noise = np.random.randn(*init_latents.shape).astype(np.float32)
-        init_latents = self.scheduler.add_noise(
-            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
-        )
-        init_latents = init_latents.numpy()
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -330,15 +340,13 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
-        latents = init_latents
-
-        t_start = max(num_inference_steps - init_timestep + offset, 0)
-        timesteps = self.scheduler.timesteps[t_start:].numpy()
-
-        for i, t in tqdm(enumerate(timesteps)):
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            # concat latents, mask, masked_image_latnets in the channel dimension
+            latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1)
+            latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+            latent_model_input = latent_model_input.numpy()
 
             # predict the noise residual
             noise_pred = self.unet(
@@ -353,12 +361,6 @@ def __call__(
             # compute the previous noisy sample x_t -> x_t-1
             latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
             latents = latents.numpy()
-            # masking
-            init_latents_proper = self.scheduler.add_noise(
-                torch.from_numpy(init_latents_orig), torch.from_numpy(noise), torch.tensor([t])
-            )
-
-            latents = (init_latents_proper * mask) + (latents * (1 - mask))
 
             # call the callback, if provided
             if callback is not None and i % callback_steps == 0:
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index ce9a02bca150..615444425c44 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -49,6 +49,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class StableDiffusionInpaintPipelineLegacy(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class StableDiffusionPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 2a581e612613..357fb11a0e8c 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -2271,7 +2271,7 @@ def test_stable_diffusion_inpaint_onnx(self):
         )
 
         pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
+            "runwayml/stable-diffusion-inpainting", revision="onnx", provider="CPUExecutionProvider"
         )
         pipe.set_progress_bar_config(disable=None)
 
@@ -2280,9 +2280,8 @@ def test_stable_diffusion_inpaint_onnx(self):
         np.random.seed(0)
         output = pipe(
             prompt=prompt,
-            init_image=init_image,
+            image=init_image,
             mask_image=mask_image,
-            strength=0.75,
             guidance_scale=7.5,
             num_inference_steps=8,
             output_type="np",
@@ -2291,7 +2290,7 @@ def test_stable_diffusion_inpaint_onnx(self):
         image_slice = images[0, 255:258, 255:258, -1]
 
         assert images.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.3524, 0.3289, 0.3464, 0.3872, 0.4129, 0.3566, 0.3709, 0.4128, 0.3734])
+        expected_slice = np.array([0.2951, 0.2955, 0.2922, 0.2036, 0.1977, 0.2279, 0.1716, 0.1641, 0.1799])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
     @slow

From 8124863d1f60aa86427b87f251c8a06e590d7fa9 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Wed, 19 Oct 2022 17:31:23 +0200
Subject: [PATCH 012/269] Initial docs update for new in-painting pipeline
 (#910)

Docs update for new in-painting pipeline.

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 README.md                               | 28 ++++++++-----------
 docs/source/api/pipelines/overview.mdx  | 20 ++++++-------
 docs/source/using-diffusers/inpaint.mdx | 37 +++++++++++++++++--------
 src/diffusers/pipelines/README.md       | 20 ++++++-------
 4 files changed, 57 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index 734cba6efd9e..fe006f41c2c6 100644
--- a/README.md
+++ b/README.md
@@ -210,14 +210,16 @@ You can also run this example on colab [![Open In Colab](https://colab.research.
 
 ### In-painting using Stable Diffusion
 
-The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and text prompt.
+The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and a text prompt. It uses a model optimized for this particular task, whose license you need to accept before use.
 
-```python
-from io import BytesIO
+Please, visit the [model card](https://huggingface.co/runwayml/stable-diffusion-inpainting), read the license carefully and tick the checkbox if you agree. Note that this is an additional license, you need to accept it even if you accepted the text-to-image Stable Diffusion license in the past. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.
 
-import torch
-import requests
+
+```python
 import PIL
+import requests
+import torch
+from io import BytesIO
 
 from diffusers import StableDiffusionInpaintPipeline
 
@@ -231,21 +233,15 @@ mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data
 init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))
 
-device = "cuda"
-model_id_or_path = "CompVis/stable-diffusion-v1-4"
 pipe = StableDiffusionInpaintPipeline.from_pretrained(
-    model_id_or_path,
-    revision="fp16", 
+    "runwayml/stable-diffusion-inpainting",
+    revision="fp16",
     torch_dtype=torch.float16,
 )
-# or download via git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
-# and pass `model_id_or_path="./stable-diffusion-v1-4"`.
-pipe = pipe.to(device)
-
-prompt = "a cat sitting on a bench"
-images = pipe(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+pipe = pipe.to("cuda")
 
-images[0].save("cat_on_bench.png")
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
 ```
 
 ### Tweak prompts reusing seeds and latents
diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index 7b2d89e849d6..3082f99665f2 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -151,10 +151,10 @@ You can generate your own latents to reproduce results, or tweak your prompt on
 The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and text prompt.
 
 ```python
-from io import BytesIO
-
-import requests
 import PIL
+import requests
+import torch
+from io import BytesIO
 
 from diffusers import StableDiffusionInpaintPipeline
 
@@ -170,15 +170,15 @@ mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data
 init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))
 
-device = "cuda"
 pipe = StableDiffusionInpaintPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
-).to(device)
-
-prompt = "a cat sitting on a bench"
-images = pipe(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+    "runwayml/stable-diffusion-inpainting",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
 
-images[0].save("cat_on_bench.png")
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
 ```
 
 You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
diff --git a/docs/source/using-diffusers/inpaint.mdx b/docs/source/using-diffusers/inpaint.mdx
index 7b4687c21204..1bafa244559b 100644
--- a/docs/source/using-diffusers/inpaint.mdx
+++ b/docs/source/using-diffusers/inpaint.mdx
@@ -12,13 +12,19 @@ specific language governing permissions and limitations under the License.
 
 # Text-Guided Image-Inpainting
 
-The [`StableDiffusionInpaintPipeline`] lets you edit specific parts of an image by providing a mask and text prompt.
+The [`StableDiffusionInpaintPipeline`] lets you edit specific parts of an image by providing a mask and a text prompt. It uses a version of Stable Diffusion specifically trained for in-painting tasks.
 
-```python
-from io import BytesIO
+<Tip warning={true}>
+Note that this model is distributed separately from the regular Stable Diffusion model, so you have to accept its license even if you accepted the Stable Diffusion one in the past.
 
-import requests
+Please, visit the [model card](https://huggingface.co/runwayml/stable-diffusion-inpainting), read the license carefully and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.
+</Tip>
+
+```python
 import PIL
+import requests
+import torch
+from io import BytesIO
 
 from diffusers import StableDiffusionInpaintPipeline
 
@@ -34,15 +40,24 @@ mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data
 init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))
 
-device = "cuda"
 pipe = StableDiffusionInpaintPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
-).to(device)
+    "runwayml/stable-diffusion-inpainting",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+```
 
-prompt = "a cat sitting on a bench"
-images = pipe(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+`image`          | `mask_image` | `prompt` | **Output** |
+:-------------------------:|:-------------------------:|:-------------------------:|-------------------------:|
+<img src="https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" alt="drawing" width="250"/> | <img src="https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" alt="drawing" width="250"/> | ***Face of a yellow cat, high resolution, sitting on a park bench*** | <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/test.png" alt="drawing" width="250"/> |
 
-images[0].save("cat_on_bench.png")
-```
 
 You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+
+<Tip warning={true}>
+A previous experimental implementation of in-painting used a different, lower-quality process. To ensure backwards compatibility, loading a pretrained pipeline that doesn't contain the new model will still apply the old in-painting method.
+</Tip>
\ No newline at end of file
diff --git a/src/diffusers/pipelines/README.md b/src/diffusers/pipelines/README.md
index b5ea112feafc..90752d69f25f 100644
--- a/src/diffusers/pipelines/README.md
+++ b/src/diffusers/pipelines/README.md
@@ -141,10 +141,10 @@ You can generate your own latents to reproduce results, or tweak your prompt on
 The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and text prompt.
 
 ```python
-from io import BytesIO
-
-import requests
 import PIL
+import requests
+import torch
+from io import BytesIO
 
 from diffusers import StableDiffusionInpaintPipeline
 
@@ -158,17 +158,15 @@ mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data
 init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))
 
-device = "cuda"
 pipe = StableDiffusionInpaintPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
-    revision="fp16", 
+    "runwayml/stable-diffusion-inpainting",
+    revision="fp16",
     torch_dtype=torch.float16,
-).to(device)
-
-prompt = "a cat sitting on a bench"
-images = pipe(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+)
+pipe = pipe.to("cuda")
 
-images[0].save("cat_on_bench.png")
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
 ```
 
 You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)

From ad9d7ce4763f8fb2a9e620bff017830c26086c36 Mon Sep 17 00:00:00 2001
From: anton-l <anton@huggingface.co>
Date: Wed, 19 Oct 2022 17:38:55 +0200
Subject: [PATCH 013/269] Release: 0.6.0

---
 setup.py                  | 2 +-
 src/diffusers/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 25e434de2801..7532feb5a360 100644
--- a/setup.py
+++ b/setup.py
@@ -211,7 +211,7 @@ def run(self):
 
 setup(
     name="diffusers",
-    version="0.6.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.6.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="Diffusers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index f36d3284777e..02d98fb71c31 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -9,7 +9,7 @@
 )
 
 
-__version__ = "0.6.0.dev0"
+__version__ = "0.6.0"
 
 from .configuration_utils import ConfigMixin
 from .onnx_utils import OnnxRuntimeModel

From 2a0c823527694058d410ed6f91b52e7dd9f94ebe Mon Sep 17 00:00:00 2001
From: SkyTNT <SKYTNT@outlook.com>
Date: Thu, 20 Oct 2022 04:30:46 +0800
Subject: [PATCH 014/269] [Community Pipelines] Long Prompt Weighting Stable
 Diffusion Pipelines (#907)

* [Community Pipelines] Long Prompt Weighting

* Update README.md

* fix

* style

* fix style

* Update examples/community/README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 examples/community/README.md                  |   60 +-
 examples/community/lpw_stable_diffusion.py    | 1035 +++++++++++++++++
 .../community/lpw_stable_diffusion_onnx.py    |  961 +++++++++++++++
 3 files changed, 2050 insertions(+), 6 deletions(-)
 create mode 100644 examples/community/lpw_stable_diffusion.py
 create mode 100644 examples/community/lpw_stable_diffusion_onnx.py

diff --git a/examples/community/README.md b/examples/community/README.md
index 9edbad7d8b6f..54a3093ad3a3 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -6,12 +6,13 @@
 Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
 If a community doesn't work as expected, please open an issue and ping the author on it.
 
-| Example   |      Description      |      Code Example |  Colab |    Author |
-|:----------|:----------------------|:-----------------|:-------------|----------:|
-| CLIP Guided Stable Diffusion | Doing CLIP guidance for text to image generation with Stable Diffusion| [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) | [Suraj Patil](https://github.com/patil-suraj/) | 
-| One Step U-Net (Dummy) | Example showcasing of how to use Community Pipelines (see https://github.com/huggingface/diffusers/issues/841) | [One Step U-Net](#one-step-unet) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
-| Stable Diffusion Interpolation | Interpolate the latent space of Stable Diffusion between different prompts/seeds | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | - | [Nate Raw](https://github.com/nateraw/) |
-| Stable Diffusion Mega | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega) | - | [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Example                                | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                      | Colab                                                                                                                                                                                                              |                                                     Author |
+|:---------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------:|
+| CLIP Guided Stable Diffusion           | Doing CLIP guidance for text to image generation with Stable Diffusion                                                                                                                                                                                                                                                                                                                                                                                                                                   | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion)     | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) |             [Suraj Patil](https://github.com/patil-suraj/) | 
+| One Step U-Net (Dummy)                 | Example showcasing of how to use Community Pipelines (see https://github.com/huggingface/diffusers/issues/841)                                                                                                                                                                                                                                                                                                                                                                                           | [One Step U-Net](#one-step-unet)                                  | -                                                                                                                                                                                                                  | [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Stable Diffusion Interpolation         | Interpolate the latent space of Stable Diffusion between different prompts/seeds                                                                                                                                                                                                                                                                                                                                                                                                                         | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | -                                                                                                                                                                                                                  |                    [Nate Raw](https://github.com/nateraw/) |
+| Stable Diffusion Mega                  | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega)                   | -                                                                                                                                                                                                                  | [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt.                                                                                                                                                                                                                                                                                                                                                                                                  | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [SkyTNT](https://github.com/SkyTNT) |
 
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
 ```py
@@ -168,3 +169,50 @@ images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_imag
 
 As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
 
+### Long Prompt Weighting Stable Diffusion
+
+The Pipeline lets you input prompt without 77 token length limit. And you can increase words weighting by using "()" or decrease words weighting by using "[]"
+The Pipeline also lets you use the main use cases of the stable diffusion pipeline in a single class.
+
+#### pytorch
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    'hakurei/waifu-diffusion',
+    custom_pipeline="lpw_stable_diffusion",
+    revision="fp16",
+    torch_dtype=torch.float16
+)
+pipe=pipe.to("cuda")
+
+prompt = "best_quality (1girl:1.3) bow bride brown_hair closed_mouth frilled_bow frilled_hair_tubes frills (full_body:1.3) fox_ear hair_bow hair_tubes happy hood japanese_clothes kimono long_sleeves red_bow smile solo tabi uchikake white_kimono wide_sleeves cherry_blossoms"
+neg_prompt = "lowres, bad_anatomy, error_body, error_hair, error_arm, error_hands, bad_hands, error_fingers, bad_fingers, missing_fingers, error_legs, bad_legs, multiple_legs, missing_legs, error_lighting, error_shadow, error_reflection, text, error, extra_digit, fewer_digits, cropped, worst_quality, low_quality, normal_quality, jpeg_artifacts, signature, watermark, username, blurry"
+
+pipe.text2img(prompt, negative_prompt=neg_prompt, width=512,height=512,max_embeddings_multiples=3).images[0]
+
+```
+
+#### onnxruntime
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    'CompVis/stable-diffusion-v1-4',
+    custom_pipeline="lpw_stable_diffusion_onnx",
+    revision="onnx",
+    provider="CUDAExecutionProvider"
+)
+
+prompt = "a photo of an astronaut riding a horse on mars, best quality"
+neg_prompt = "lowres, bad anatomy, error body, error hair, error arm, error hands, bad hands, error fingers, bad fingers, missing fingers, error legs, bad legs, multiple legs, missing legs, error lighting, error shadow, error reflection, text, error, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
+
+pipe.text2img(prompt,negative_prompt=neg_prompt, width=512, height=512, max_embeddings_multiples=3).images[0]
+
+```
+
+if you see `Token indices sequence length is longer than the specified maximum sequence length for this model ( *** > 77 ) . Running this sequence through the model will result in indexing errors`. Do not worry, it is normal.
diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
new file mode 100644
index 000000000000..3d4ec23e3aea
--- /dev/null
+++ b/examples/community/lpw_stable_diffusion.py
@@ -0,0 +1,1035 @@
+import inspect
+import re
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+re_attention = re.compile(
+    r"""
+\\\(|
+\\\)|
+\\\[|
+\\]|
+\\\\|
+\\|
+\(|
+\[|
+:([+-]?[.\d]+)\)|
+\)|
+]|
+[^\\()\[\]:]+|
+:
+""",
+    re.X,
+)
+
+
+def parse_prompt_attention(text):
+    """
+    Parses a string with attention tokens and returns a list of pairs: text and its assoicated weight.
+    Accepted tokens are:
+      (abc) - increases attention to abc by a multiplier of 1.1
+      (abc:3.12) - increases attention to abc by a multiplier of 3.12
+      [abc] - decreases attention to abc by a multiplier of 1.1
+      \( - literal character '('
+      \[ - literal character '['
+      \) - literal character ')'
+      \] - literal character ']'
+      \\ - literal character '\'
+      anything else - just text
+    >>> parse_prompt_attention('normal text')
+    [['normal text', 1.0]]
+    >>> parse_prompt_attention('an (important) word')
+    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
+    >>> parse_prompt_attention('(unbalanced')
+    [['unbalanced', 1.1]]
+    >>> parse_prompt_attention('\(literal\]')
+    [['(literal]', 1.0]]
+    >>> parse_prompt_attention('(unnecessary)(parens)')
+    [['unnecessaryparens', 1.1]]
+    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
+    [['a ', 1.0],
+     ['house', 1.5730000000000004],
+     [' ', 1.1],
+     ['on', 1.0],
+     [' a ', 1.1],
+     ['hill', 0.55],
+     [', sun, ', 1.1],
+     ['sky', 1.4641000000000006],
+     ['.', 1.1]]
+    """
+
+    res = []
+    round_brackets = []
+    square_brackets = []
+
+    round_bracket_multiplier = 1.1
+    square_bracket_multiplier = 1 / 1.1
+
+    def multiply_range(start_position, multiplier):
+        for p in range(start_position, len(res)):
+            res[p][1] *= multiplier
+
+    for m in re_attention.finditer(text):
+        text = m.group(0)
+        weight = m.group(1)
+
+        if text.startswith("\\"):
+            res.append([text[1:], 1.0])
+        elif text == "(":
+            round_brackets.append(len(res))
+        elif text == "[":
+            square_brackets.append(len(res))
+        elif weight is not None and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), float(weight))
+        elif text == ")" and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), round_bracket_multiplier)
+        elif text == "]" and len(square_brackets) > 0:
+            multiply_range(square_brackets.pop(), square_bracket_multiplier)
+        else:
+            res.append([text, 1.0])
+
+    for pos in round_brackets:
+        multiply_range(pos, round_bracket_multiplier)
+
+    for pos in square_brackets:
+        multiply_range(pos, square_bracket_multiplier)
+
+    if len(res) == 0:
+        res = [["", 1.0]]
+
+    # merge runs of identical weights
+    i = 0
+    while i + 1 < len(res):
+        if res[i][1] == res[i + 1][1]:
+            res[i][0] += res[i + 1][0]
+            res.pop(i + 1)
+        else:
+            i += 1
+
+    return res
+
+
+def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_length: int):
+    r"""
+    Tokenize a list of prompts and return its tokens with weights of each token.
+
+    No padding, starting or ending token is included.
+    """
+    tokens = []
+    weights = []
+    for text in prompt:
+        texts_and_weights = parse_prompt_attention(text)
+        text_token = []
+        text_weight = []
+        for word, weight in texts_and_weights:
+            # tokenize and discard the starting and the ending token
+            token = pipe.tokenizer(word).input_ids[1:-1]
+            text_token += token
+
+            # copy the weight by length of token
+            text_weight += [weight] * len(token)
+
+            # stop if the text is too long (longer than truncation limit)
+            if len(text_token) > max_length:
+                break
+
+        # truncate
+        if len(text_token) > max_length:
+            text_token = text_token[:max_length]
+            text_weight = text_weight[:max_length]
+
+        tokens.append(text_token)
+        weights.append(text_weight)
+    return tokens, weights
+
+
+def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
+    r"""
+    Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
+    """
+    max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
+    weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
+    for i in range(len(tokens)):
+        tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
+        if no_boseos_middle:
+            weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
+        else:
+            w = []
+            if len(weights[i]) == 0:
+                w = [1.0] * weights_length
+            else:
+                for j in range((len(weights[i]) - 1) // chunk_length + 1):
+                    w.append(1.0)  # weight for starting token in this chunk
+                    w += weights[i][j * chunk_length : min(len(weights[i]), (j + 1) * chunk_length)]
+                    w.append(1.0)  # weight for ending token in this chunk
+                w += [1.0] * (weights_length - len(w))
+            weights[i] = w[:]
+
+    return tokens, weights
+
+
+def get_unweighted_text_embeddings(
+    pipe: DiffusionPipeline, text_input: torch.Tensor, chunk_length: int, no_boseos_middle: Optional[bool] = True
+):
+    """
+    When the length of tokens is a multiple of the capacity of the text encoder,
+    it should be split into chunks and sent to the text encoder individually.
+    """
+    max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
+    if max_embeddings_multiples > 1:
+        text_embeddings = []
+        for i in range(max_embeddings_multiples):
+            # extract the i-th chunk
+            text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
+
+            # cover the head and the tail by the starting and the ending tokens
+            text_input_chunk[:, 0] = text_input[0, 0]
+            text_input_chunk[:, -1] = text_input[0, -1]
+            text_embedding = pipe.text_encoder(text_input_chunk)[0]
+
+            if no_boseos_middle:
+                if i == 0:
+                    # discard the ending token
+                    text_embedding = text_embedding[:, :-1]
+                elif i == max_embeddings_multiples - 1:
+                    # discard the starting token
+                    text_embedding = text_embedding[:, 1:]
+                else:
+                    # discard both starting and ending tokens
+                    text_embedding = text_embedding[:, 1:-1]
+
+            text_embeddings.append(text_embedding)
+        text_embeddings = torch.concat(text_embeddings, axis=1)
+    else:
+        text_embeddings = pipe.text_encoder(text_input)[0]
+    return text_embeddings
+
+
+def get_weighted_text_embeddings(
+    pipe: DiffusionPipeline,
+    prompt: Union[str, List[str]],
+    uncond_prompt: Optional[Union[str, List[str]]] = None,
+    max_embeddings_multiples: Optional[int] = 1,
+    no_boseos_middle: Optional[bool] = False,
+    skip_parsing: Optional[bool] = False,
+    skip_weighting: Optional[bool] = False,
+    **kwargs,
+):
+    r"""
+    Prompts can be assigned with local weights using brackets. For example,
+    prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
+    and the embedding tokens corresponding to the words get multipled by a constant, 1.1.
+
+    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the origional mean.
+
+    Args:
+        pipe (`DiffusionPipeline`):
+            Pipe to provide access to the tokenizer and the text encoder.
+        prompt (`str` or `List[str]`):
+            The prompt or prompts to guide the image generation.
+        uncond_prompt (`str` or `List[str]`):
+            The unconditional prompt or prompts for guide the image generation. If unconditional prompt
+            is provided, the embeddings of prompt and uncond_prompt are concatenated.
+        max_embeddings_multiples (`int`, *optional*, defaults to `1`):
+            The max multiple length of prompt embeddings compared to the max output length of text encoder.
+        no_boseos_middle (`bool`, *optional*, defaults to `False`):
+            If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and
+            ending token in each of the chunk in the middle.
+        skip_parsing (`bool`, *optional*, defaults to `False`):
+            Skip the parsing of brackets.
+        skip_weighting (`bool`, *optional*, defaults to `False`):
+            Skip the weighting. When the parsing is skipped, it is forced True.
+    """
+    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
+    if isinstance(prompt, str):
+        prompt = [prompt]
+
+    if not skip_parsing:
+        prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
+        if uncond_prompt is not None:
+            if isinstance(uncond_prompt, str):
+                uncond_prompt = [uncond_prompt]
+            uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
+    else:
+        prompt_tokens = [
+            token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids
+        ]
+        prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
+        if uncond_prompt is not None:
+            if isinstance(uncond_prompt, str):
+                uncond_prompt = [uncond_prompt]
+            uncond_tokens = [
+                token[1:-1]
+                for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids
+            ]
+            uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
+
+    # round up the longest length of tokens to a multiple of (model_max_length - 2)
+    max_length = max([len(token) for token in prompt_tokens])
+    if uncond_prompt is not None:
+        max_length = max(max_length, max([len(token) for token in uncond_tokens]))
+
+    max_embeddings_multiples = min(
+        max_embeddings_multiples, (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1
+    )
+    max_embeddings_multiples = max(1, max_embeddings_multiples)
+    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
+
+    # pad the length of tokens and weights
+    bos = pipe.tokenizer.bos_token_id
+    eos = pipe.tokenizer.eos_token_id
+    prompt_tokens, prompt_weights = pad_tokens_and_weights(
+        prompt_tokens,
+        prompt_weights,
+        max_length,
+        bos,
+        eos,
+        no_boseos_middle=no_boseos_middle,
+        chunk_length=pipe.tokenizer.model_max_length,
+    )
+    prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device)
+    if uncond_prompt is not None:
+        uncond_tokens, uncond_weights = pad_tokens_and_weights(
+            uncond_tokens,
+            uncond_weights,
+            max_length,
+            bos,
+            eos,
+            no_boseos_middle=no_boseos_middle,
+            chunk_length=pipe.tokenizer.model_max_length,
+        )
+        uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device)
+
+    # get the embeddings
+    text_embeddings = get_unweighted_text_embeddings(
+        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+    )
+    prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device)
+    if uncond_prompt is not None:
+        uncond_embeddings = get_unweighted_text_embeddings(
+            pipe, uncond_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+        )
+        uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device)
+
+    # assign weights to the prompts and normalize in the sense of mean
+    # TODO: should we normalize by chunk or in a whole (current implementation)?
+    if (not skip_parsing) and (not skip_weighting):
+        previous_mean = text_embeddings.mean(axis=[-2, -1])
+        text_embeddings *= prompt_weights.unsqueeze(-1)
+        text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
+        if uncond_prompt is not None:
+            previous_mean = uncond_embeddings.mean(axis=[-2, -1])
+            uncond_embeddings *= uncond_weights.unsqueeze(-1)
+            uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
+
+    if uncond_prompt is not None:
+        return text_embeddings, uncond_embeddings
+    return text_embeddings, None
+
+
+def preprocess_image(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask):
+    mask = mask.convert("L")
+    w, h = mask.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = np.array(mask).astype(np.float32) / 255.0
+    mask = np.tile(mask, (4, 1, 1))
+    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
+    mask = 1 - mask  # repaint white, keep black
+    mask = torch.from_numpy(mask)
+    return mask
+
+
+class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
+    weighting in prompt.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        strength: float = 0.8,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+            prompt = [prompt]
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+        elif isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt] * batch_size
+        if batch_size != len(negative_prompt):
+            raise ValueError(
+                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                " the batch size of `prompt`."
+            )
+
+        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
+            pipe=self,
+            prompt=prompt,
+            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
+            max_embeddings_multiples=max_embeddings_multiples,
+            **kwargs,
+        )
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            bs_embed, seq_len, _ = uncond_embeddings.shape
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        latents_dtype = text_embeddings.dtype
+        init_latents_orig = None
+        mask = None
+        noise = None
+
+        if init_image is None:
+            # get the initial random noise unless the user supplied it
+
+            # Unlike in other pipelines, latents need to be generated in the target device
+            # for 1-to-1 results reproducibility with the CompVis implementation.
+            # However this currently doesn't work in `mps`.
+            latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+
+            if latents is None:
+                if self.device.type == "mps":
+                    # randn does not exist on mps
+                    latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                        self.device
+                    )
+                else:
+                    latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+            else:
+                if latents.shape != latents_shape:
+                    raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+                latents = latents.to(self.device)
+
+            timesteps = self.scheduler.timesteps.to(self.device)
+
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
+        else:
+            if isinstance(init_image, PIL.Image.Image):
+                init_image = preprocess_image(init_image)
+            # encode the init image into latents and scale the latents
+            init_image = init_image.to(device=self.device, dtype=latents_dtype)
+            init_latent_dist = self.vae.encode(init_image).latent_dist
+            init_latents = init_latent_dist.sample(generator=generator)
+            init_latents = 0.18215 * init_latents
+            init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
+            init_latents_orig = init_latents
+
+            # preprocess mask
+            if mask_image is not None:
+                if isinstance(mask_image, PIL.Image.Image):
+                    mask_image = preprocess_mask(mask_image)
+                mask_image = mask_image.to(device=self.device, dtype=latents_dtype)
+                mask = torch.cat([mask_image] * batch_size * num_images_per_prompt)
+
+                # check sizes
+                if not mask.shape == init_latents.shape:
+                    raise ValueError("The mask and init_image should be the same size!")
+
+            # get the original timestep using init_timestep
+            offset = self.scheduler.config.get("steps_offset", 0)
+            init_timestep = int(num_inference_steps * strength) + offset
+            init_timestep = min(init_timestep, num_inference_steps)
+
+            timesteps = self.scheduler.timesteps[-init_timestep]
+            timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
+
+            # add noise to latents using the timesteps
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                noise = torch.randn(init_latents.shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
+            latents = self.scheduler.add_noise(init_latents, noise, timesteps)
+
+            t_start = max(num_inference_steps - init_timestep + offset, 0)
+            timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            if mask is not None:
+                # masking
+                init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
+                latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def text2img(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function for text-to-image generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+    def img2img(
+        self,
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function for image-to-image generation.
+        Args:
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            init_image=init_image,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            strength=strength,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+    def inpaint(
+        self,
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function for inpaint.
+        Args:
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. This is the image whose masked region will be inpainted.
+            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
+                is 1, the denoising process will be run on the masked area for the full number of iterations specified
+                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
+                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            init_image=init_image,
+            mask_image=mask_image,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            strength=strength,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py
new file mode 100644
index 000000000000..b5b9a2a65fb0
--- /dev/null
+++ b/examples/community/lpw_stable_diffusion_onnx.py
@@ -0,0 +1,961 @@
+import inspect
+import re
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from diffusers.onnx_utils import OnnxRuntimeModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import logging
+from transformers import CLIPFeatureExtractor, CLIPTokenizer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+re_attention = re.compile(
+    r"""
+\\\(|
+\\\)|
+\\\[|
+\\]|
+\\\\|
+\\|
+\(|
+\[|
+:([+-]?[.\d]+)\)|
+\)|
+]|
+[^\\()\[\]:]+|
+:
+""",
+    re.X,
+)
+
+
+def parse_prompt_attention(text):
+    """
+    Parses a string with attention tokens and returns a list of pairs: text and its assoicated weight.
+    Accepted tokens are:
+      (abc) - increases attention to abc by a multiplier of 1.1
+      (abc:3.12) - increases attention to abc by a multiplier of 3.12
+      [abc] - decreases attention to abc by a multiplier of 1.1
+      \( - literal character '('
+      \[ - literal character '['
+      \) - literal character ')'
+      \] - literal character ']'
+      \\ - literal character '\'
+      anything else - just text
+    >>> parse_prompt_attention('normal text')
+    [['normal text', 1.0]]
+    >>> parse_prompt_attention('an (important) word')
+    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
+    >>> parse_prompt_attention('(unbalanced')
+    [['unbalanced', 1.1]]
+    >>> parse_prompt_attention('\(literal\]')
+    [['(literal]', 1.0]]
+    >>> parse_prompt_attention('(unnecessary)(parens)')
+    [['unnecessaryparens', 1.1]]
+    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
+    [['a ', 1.0],
+     ['house', 1.5730000000000004],
+     [' ', 1.1],
+     ['on', 1.0],
+     [' a ', 1.1],
+     ['hill', 0.55],
+     [', sun, ', 1.1],
+     ['sky', 1.4641000000000006],
+     ['.', 1.1]]
+    """
+
+    res = []
+    round_brackets = []
+    square_brackets = []
+
+    round_bracket_multiplier = 1.1
+    square_bracket_multiplier = 1 / 1.1
+
+    def multiply_range(start_position, multiplier):
+        for p in range(start_position, len(res)):
+            res[p][1] *= multiplier
+
+    for m in re_attention.finditer(text):
+        text = m.group(0)
+        weight = m.group(1)
+
+        if text.startswith("\\"):
+            res.append([text[1:], 1.0])
+        elif text == "(":
+            round_brackets.append(len(res))
+        elif text == "[":
+            square_brackets.append(len(res))
+        elif weight is not None and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), float(weight))
+        elif text == ")" and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), round_bracket_multiplier)
+        elif text == "]" and len(square_brackets) > 0:
+            multiply_range(square_brackets.pop(), square_bracket_multiplier)
+        else:
+            res.append([text, 1.0])
+
+    for pos in round_brackets:
+        multiply_range(pos, round_bracket_multiplier)
+
+    for pos in square_brackets:
+        multiply_range(pos, square_bracket_multiplier)
+
+    if len(res) == 0:
+        res = [["", 1.0]]
+
+    # merge runs of identical weights
+    i = 0
+    while i + 1 < len(res):
+        if res[i][1] == res[i + 1][1]:
+            res[i][0] += res[i + 1][0]
+            res.pop(i + 1)
+        else:
+            i += 1
+
+    return res
+
+
+def get_prompts_with_weights(pipe, prompt: List[str], max_length: int):
+    r"""
+    Tokenize a list of prompts and return its tokens with weights of each token.
+
+    No padding, starting or ending token is included.
+    """
+    tokens = []
+    weights = []
+    for text in prompt:
+        texts_and_weights = parse_prompt_attention(text)
+        text_token = []
+        text_weight = []
+        for word, weight in texts_and_weights:
+            # tokenize and discard the starting and the ending token
+            token = pipe.tokenizer(word, return_tensors="np").input_ids[0, 1:-1]
+            text_token += list(token)
+
+            # copy the weight by length of token
+            text_weight += [weight] * len(token)
+
+            # stop if the text is too long (longer than truncation limit)
+            if len(text_token) > max_length:
+                break
+
+        # truncate
+        if len(text_token) > max_length:
+            text_token = text_token[:max_length]
+            text_weight = text_weight[:max_length]
+
+        tokens.append(text_token)
+        weights.append(text_weight)
+    return tokens, weights
+
+
+def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
+    r"""
+    Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
+    """
+    max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
+    weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
+    for i in range(len(tokens)):
+        tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
+        if no_boseos_middle:
+            weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
+        else:
+            w = []
+            if len(weights[i]) == 0:
+                w = [1.0] * weights_length
+            else:
+                for j in range((len(weights[i]) - 1) // chunk_length + 1):
+                    w.append(1.0)  # weight for starting token in this chunk
+                    w += weights[i][j * chunk_length : min(len(weights[i]), (j + 1) * chunk_length)]
+                    w.append(1.0)  # weight for ending token in this chunk
+                w += [1.0] * (weights_length - len(w))
+            weights[i] = w[:]
+
+    return tokens, weights
+
+
+def get_unweighted_text_embeddings(
+    pipe, text_input: np.array, chunk_length: int, no_boseos_middle: Optional[bool] = True
+):
+    """
+    When the length of tokens is a multiple of the capacity of the text encoder,
+    it should be split into chunks and sent to the text encoder individually.
+    """
+    max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
+    if max_embeddings_multiples > 1:
+        text_embeddings = []
+        for i in range(max_embeddings_multiples):
+            # extract the i-th chunk
+            text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].copy()
+
+            # cover the head and the tail by the starting and the ending tokens
+            text_input_chunk[:, 0] = text_input[0, 0]
+            text_input_chunk[:, -1] = text_input[0, -1]
+
+            text_embedding = pipe.text_encoder(input_ids=text_input_chunk)[0]
+
+            if no_boseos_middle:
+                if i == 0:
+                    # discard the ending token
+                    text_embedding = text_embedding[:, :-1]
+                elif i == max_embeddings_multiples - 1:
+                    # discard the starting token
+                    text_embedding = text_embedding[:, 1:]
+                else:
+                    # discard both starting and ending tokens
+                    text_embedding = text_embedding[:, 1:-1]
+
+            text_embeddings.append(text_embedding)
+        text_embeddings = np.concatenate(text_embeddings, axis=1)
+    else:
+        text_embeddings = pipe.text_encoder(input_ids=text_input)[0]
+    return text_embeddings
+
+
+def get_weighted_text_embeddings(
+    pipe,
+    prompt: Union[str, List[str]],
+    uncond_prompt: Optional[Union[str, List[str]]] = None,
+    max_embeddings_multiples: Optional[int] = 4,
+    no_boseos_middle: Optional[bool] = False,
+    skip_parsing: Optional[bool] = False,
+    skip_weighting: Optional[bool] = False,
+    **kwargs,
+):
+    r"""
+    Prompts can be assigned with local weights using brackets. For example,
+    prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
+    and the embedding tokens corresponding to the words get multipled by a constant, 1.1.
+
+    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the origional mean.
+
+    Args:
+        pipe (`DiffusionPipeline`):
+            Pipe to provide access to the tokenizer and the text encoder.
+        prompt (`str` or `List[str]`):
+            The prompt or prompts to guide the image generation.
+        uncond_prompt (`str` or `List[str]`):
+            The unconditional prompt or prompts for guide the image generation. If unconditional prompt
+            is provided, the embeddings of prompt and uncond_prompt are concatenated.
+        max_embeddings_multiples (`int`, *optional*, defaults to `1`):
+            The max multiple length of prompt embeddings compared to the max output length of text encoder.
+        no_boseos_middle (`bool`, *optional*, defaults to `False`):
+            If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and
+            ending token in each of the chunk in the middle.
+        skip_parsing (`bool`, *optional*, defaults to `False`):
+            Skip the parsing of brackets.
+        skip_weighting (`bool`, *optional*, defaults to `False`):
+            Skip the weighting. When the parsing is skipped, it is forced True.
+    """
+    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
+    if isinstance(prompt, str):
+        prompt = [prompt]
+
+    if not skip_parsing:
+        prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
+        if uncond_prompt is not None:
+            if isinstance(uncond_prompt, str):
+                uncond_prompt = [uncond_prompt]
+            uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
+    else:
+        prompt_tokens = [
+            token[1:-1]
+            for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True, return_tensors="np").input_ids
+        ]
+        prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
+        if uncond_prompt is not None:
+            if isinstance(uncond_prompt, str):
+                uncond_prompt = [uncond_prompt]
+            uncond_tokens = [
+                token[1:-1]
+                for token in pipe.tokenizer(
+                    uncond_prompt, max_length=max_length, truncation=True, return_tensors="np"
+                ).input_ids
+            ]
+            uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
+
+    # round up the longest length of tokens to a multiple of (model_max_length - 2)
+    max_length = max([len(token) for token in prompt_tokens])
+    if uncond_prompt is not None:
+        max_length = max(max_length, max([len(token) for token in uncond_tokens]))
+
+    max_embeddings_multiples = min(
+        max_embeddings_multiples, (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1
+    )
+    max_embeddings_multiples = max(1, max_embeddings_multiples)
+    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
+
+    # pad the length of tokens and weights
+    bos = pipe.tokenizer.bos_token_id
+    eos = pipe.tokenizer.eos_token_id
+    prompt_tokens, prompt_weights = pad_tokens_and_weights(
+        prompt_tokens,
+        prompt_weights,
+        max_length,
+        bos,
+        eos,
+        no_boseos_middle=no_boseos_middle,
+        chunk_length=pipe.tokenizer.model_max_length,
+    )
+    prompt_tokens = np.array(prompt_tokens, dtype=np.int32)
+    if uncond_prompt is not None:
+        uncond_tokens, uncond_weights = pad_tokens_and_weights(
+            uncond_tokens,
+            uncond_weights,
+            max_length,
+            bos,
+            eos,
+            no_boseos_middle=no_boseos_middle,
+            chunk_length=pipe.tokenizer.model_max_length,
+        )
+        uncond_tokens = np.array(uncond_tokens, dtype=np.int32)
+
+    # get the embeddings
+    text_embeddings = get_unweighted_text_embeddings(
+        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+    )
+    prompt_weights = np.array(prompt_weights, dtype=text_embeddings.dtype)
+    if uncond_prompt is not None:
+        uncond_embeddings = get_unweighted_text_embeddings(
+            pipe, uncond_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+        )
+        uncond_weights = np.array(uncond_weights, dtype=uncond_embeddings.dtype)
+
+    # assign weights to the prompts and normalize in the sense of mean
+    # TODO: should we normalize by chunk or in a whole (current implementation)?
+    if (not skip_parsing) and (not skip_weighting):
+        previous_mean = text_embeddings.mean(axis=(-2, -1))
+        text_embeddings *= prompt_weights[:, :, None]
+        text_embeddings *= (previous_mean / text_embeddings.mean(axis=(-2, -1)))[:, None, None]
+        if uncond_prompt is not None:
+            previous_mean = uncond_embeddings.mean(axis=(-2, -1))
+            uncond_embeddings *= uncond_weights[:, :, None]
+            uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=(-2, -1)))[:, None, None]
+
+    # For classifier free guidance, we need to do two forward passes.
+    # Here we concatenate the unconditional and text embeddings into a single batch
+    # to avoid doing two forward passes
+    if uncond_prompt is not None:
+        return text_embeddings, uncond_embeddings
+
+    return text_embeddings
+
+
+def preprocess_image(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask):
+    mask = mask.convert("L")
+    w, h = mask.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = np.array(mask).astype(np.float32) / 255.0
+    mask = np.tile(mask, (4, 1, 1))
+    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
+    mask = 1 - mask  # repaint white, keep black
+    return mask
+
+
+class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
+    weighting in prompt.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    """
+
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        init_image: Union[np.ndarray, PIL.Image.Image] = None,
+        mask_image: Union[np.ndarray, PIL.Image.Image] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        strength: float = 0.8,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        latents: Optional[np.ndarray] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            init_image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            mask_image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
+            latents (`np.ndarray`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+            prompt = [prompt]
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if negative_prompt is None:
+            negative_prompt = [""] * batch_size
+        elif isinstance(negative_prompt, str):
+            negative_prompt = [negative_prompt] * batch_size
+        if batch_size != len(negative_prompt):
+            raise ValueError(
+                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                " the batch size of `prompt`."
+            )
+
+        if generator is None:
+            generator = np.random
+
+        text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
+            pipe=self,
+            prompt=prompt,
+            uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
+            max_embeddings_multiples=max_embeddings_multiples,
+            **kwargs,
+        )
+
+        text_embeddings = text_embeddings.repeat(num_images_per_prompt, 0)
+        if do_classifier_free_guidance:
+            uncond_embeddings = uncond_embeddings.repeat(num_images_per_prompt, 0)
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        latents_dtype = text_embeddings.dtype
+        init_latents_orig = None
+        mask = None
+        noise = None
+
+        if init_image is None:
+            latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
+
+            if latents is None:
+                latents = generator.randn(*latents_shape).astype(latents_dtype)
+            elif latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+
+            timesteps = self.scheduler.timesteps.to(self.device)
+
+            # scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
+        else:
+            if isinstance(init_image, PIL.Image.Image):
+                init_image = preprocess_image(init_image)
+            # encode the init image into latents and scale the latents
+            init_image = init_image.astype(latents_dtype)
+            init_latents = self.vae_encoder(sample=init_image)[0]
+            init_latents = 0.18215 * init_latents
+            init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt)
+            init_latents_orig = init_latents
+
+            # preprocess mask
+            if mask_image is not None:
+                if isinstance(mask_image, PIL.Image.Image):
+                    mask_image = preprocess_mask(mask_image)
+                mask_image = mask_image.astype(latents_dtype)
+                mask = np.concatenate([mask_image] * batch_size * num_images_per_prompt)
+
+                # check sizes
+                if not mask.shape == init_latents.shape:
+                    print(mask.shape, init_latents.shape)
+                    raise ValueError("The mask and init_image should be the same size!")
+
+            # get the original timestep using init_timestep
+            offset = self.scheduler.config.get("steps_offset", 0)
+            init_timestep = int(num_inference_steps * strength) + offset
+            init_timestep = min(init_timestep, num_inference_steps)
+
+            timesteps = self.scheduler.timesteps[-init_timestep]
+            timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt)
+
+            # add noise to latents using the timesteps
+            noise = generator.randn(*init_latents.shape).astype(latents_dtype)
+            latents = self.scheduler.add_noise(
+                torch.from_numpy(init_latents), torch.from_numpy(noise), timesteps
+            ).numpy()
+
+            t_start = max(num_inference_steps - init_timestep + offset, 0)
+            timesteps = self.scheduler.timesteps[t_start:]
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(
+                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
+            )
+            noise_pred = noise_pred[0]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample.numpy()
+
+            if mask is not None:
+                # masking
+                init_latents_proper = self.scheduler.add_noise(
+                    torch.from_numpy(init_latents_orig), torch.from_numpy(noise), torch.tensor([t])
+                ).numpy()
+                latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems likes there is a problem for using half-precision vae decoder if batchsize>1
+        image = []
+        for i in range(latents.shape[0]):
+            image.append(self.vae_decoder(latent_sample=latents[i : i + 1])[0])
+        image = np.concatenate(image)
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # There will throw an error if use safety_checker directly and batchsize>1
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i)
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+
+    def text2img(
+        self,
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        latents: Optional[np.ndarray] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function for text-to-image generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
+            latents (`np.ndarray`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+    def img2img(
+        self,
+        init_image: Union[np.ndarray, PIL.Image.Image],
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function for image-to-image generation.
+        Args:
+            init_image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or ndarray representing an image batch, that will be used as the starting point for the
+                process.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            init_image=init_image,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            strength=strength,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )
+
+    def inpaint(
+        self,
+        init_image: Union[np.ndarray, PIL.Image.Image],
+        mask_image: Union[np.ndarray, PIL.Image.Image],
+        prompt: Union[str, List[str]],
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        max_embeddings_multiples: Optional[int] = 3,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function for inpaint.
+        Args:
+            init_image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. This is the image whose masked region will be inpainted.
+            mask_image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
+                is 1, the denoising process will be run on the masked area for the full number of iterations specified
+                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
+                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
+            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
+                The max multiple length of prompt embeddings compared to the max output length of text encoder.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        return self.__call__(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            init_image=init_image,
+            mask_image=mask_image,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            strength=strength,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            max_embeddings_multiples=max_embeddings_multiples,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+            **kwargs,
+        )

From 83f8a5ff70d9305735b23131ed2015d3db0e7422 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 20 Oct 2022 13:28:11 +0200
Subject: [PATCH 015/269] [Stable Diffusion] Add components function (#889)

* [Stable Diffusion] Add components function

* uP
---
 docs/source/api/diffusion_pipeline.mdx        |  3 ++
 .../source/api/pipelines/stable_diffusion.mdx | 20 +++++++
 src/diffusers/pipeline_utils.py               | 37 ++++++++++++-
 tests/test_pipelines.py                       | 53 +++++++++++++++++++
 4 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/docs/source/api/diffusion_pipeline.mdx b/docs/source/api/diffusion_pipeline.mdx
index 6a0f758f7694..b037b4e26dc1 100644
--- a/docs/source/api/diffusion_pipeline.mdx
+++ b/docs/source/api/diffusion_pipeline.mdx
@@ -32,6 +32,9 @@ Any pipeline object can be saved locally with [`~DiffusionPipeline.save_pretrain
 [[autodoc]] DiffusionPipeline
 	- from_pretrained
 	- save_pretrained
+	- to
+	- device
+	- components
 
 ## ImagePipelineOutput
 By default diffusion pipelines return an object of class
diff --git a/docs/source/api/pipelines/stable_diffusion.mdx b/docs/source/api/pipelines/stable_diffusion.mdx
index 3288c679de95..8fb0e470b479 100644
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -17,6 +17,26 @@ For more details about how Stable Diffusion works and how it differs from the ba
 | [pipeline_stable_diffusion_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) | [🤗 Diffuse the Rest](https://huggingface.co/spaces/huggingface/diffuse-the-rest)
 | [pipeline_stable_diffusion_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | **Experimental** – *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) | Coming soon
 
+## Tips
+
+If you want to use all possible use cases in a single `DiffusionPipeline` you can either:
+- Make use of the [Stable Diffusion Mega Pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community#stable-diffusion-mega) or 
+- Make use of the `components` functionality to instantiate all components in the most memory-efficient way:
+
+```python
+>>> from diffusers import (
+...     StableDiffusionPipeline,
+...     StableDiffusionImg2ImgPipeline,
+...     StableDiffusionInpaintPipeline,
+... )
+
+>>> img2text = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+>>> img2img = StableDiffusionImg2ImgPipeline(**img2text.components)
+>>> inpaint = StableDiffusionInpaintPipeline(**img2text.components)
+
+>>> # now you can use img2text(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
+```
+
 ## StableDiffusionPipelineOutput
 [[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
 
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 37bc228b10e6..1c787a41ae89 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -18,7 +18,7 @@
 import inspect
 import os
 from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import torch
@@ -561,6 +561,41 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         model = pipeline_class(**init_kwargs)
         return model
 
+    @property
+    def components(self) -> Dict[str, Any]:
+        r"""
+
+        The `self.compenents` property can be useful to run different pipelines with the same weights and
+        configurations to not have to re-allocate memory.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import (
+        ...     StableDiffusionPipeline,
+        ...     StableDiffusionImg2ImgPipeline,
+        ...     StableDiffusionInpaintPipeline,
+        ... )
+
+        >>> img2text = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        >>> img2img = StableDiffusionImg2ImgPipeline(**img2text.components)
+        >>> inpaint = StableDiffusionInpaintPipeline(**img2text.components)
+        ```
+
+        Returns:
+            A dictionaly containing all the modules needed to initialize the pipleline.
+        """
+        components = {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}
+        expected_modules = set(inspect.signature(self.__init__).parameters.keys()) - set(["self"])
+
+        if set(components.keys()) != expected_modules:
+            raise ValueError(
+                f"{self} has been incorrectly initialized or {self.__class__} is incorrectly implemented. Expected"
+                f" {expected_modules} to be defined, but {components} are defined."
+            )
+
+        return components
+
     @staticmethod
     def numpy_to_pil(images):
         """
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 357fb11a0e8c..1de2e6f5634f 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -1391,6 +1391,59 @@ def test_stable_diffusion_inpaint_fp16(self):
 
         assert image.shape == (1, 128, 128, 3)
 
+    def test_components(self):
+        """Test that components property works correctly"""
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+
+        # make sure here that pndm scheduler skips prk
+        inpaint = StableDiffusionInpaintPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=self.dummy_safety_checker,
+            feature_extractor=self.dummy_extractor,
+        )
+        img2img = StableDiffusionImg2ImgPipeline(**inpaint.components)
+        text2img = StableDiffusionPipeline(**inpaint.components)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image_inpaint = inpaint(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            mask_image=mask_image,
+        ).images
+        image_img2img = img2img(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+        ).images
+        image_text2img = text2img(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+
+        assert image_inpaint.shape == (1, 32, 32, 3)
+        assert image_img2img.shape == (1, 32, 32, 3)
+        assert image_text2img.shape == (1, 128, 128, 3)
+
 
 class PipelineTesterMixin(unittest.TestCase):
     def tearDown(self):

From 4a76e5d49b290f39d4df70b777ea70c523db5fc2 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 20 Oct 2022 13:29:04 +0200
Subject: [PATCH 016/269] [PNDM Scheduler] Make sure list cannot grow forever
 (#882)

---
 src/diffusers/schedulers/scheduling_pndm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py
index 99ccc6c66f20..829329d2e10b 100644
--- a/src/diffusers/schedulers/scheduling_pndm.py
+++ b/src/diffusers/schedulers/scheduling_pndm.py
@@ -310,6 +310,7 @@ def step_plms(
         prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
 
         if self.counter != 1:
+            self.ets = self.ets[-3:]
             self.ets.append(model_output)
         else:
             prev_timestep = timestep

From db19a9d9d7d8fafb372675c16fcdde6674d4e1ab Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 20 Oct 2022 13:30:01 +0200
Subject: [PATCH 017/269] =?UTF-8?q?[DiffusionPipeline.from=5Fpretrained]?=
 =?UTF-8?q?=20add=20warning=20when=20passing=20unused=20k=E2=80=A6=20(#870?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[DiffusionPipeline.from_pretrained] add warning when passing unused kwargs
---
 src/diffusers/pipeline_utils.py      |  7 ++++-
 src/diffusers/utils/testing_utils.py | 41 ++++++++++++++++++++++++++++
 tests/test_pipelines.py              | 12 +++++++-
 3 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 1c787a41ae89..5d186f35ea11 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -350,6 +350,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         """
         cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
         resume_download = kwargs.pop("resume_download", False)
+        force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
         local_files_only = kwargs.pop("local_files_only", False)
         use_auth_token = kwargs.pop("use_auth_token", None)
@@ -367,6 +368,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 pretrained_model_name_or_path,
                 cache_dir=cache_dir,
                 resume_download=resume_download,
+                force_download=force_download,
                 proxies=proxies,
                 local_files_only=local_files_only,
                 use_auth_token=use_auth_token,
@@ -439,7 +441,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         expected_modules = set(inspect.signature(pipeline_class.__init__).parameters.keys()) - set(["self"])
         passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
 
-        init_dict, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
+        init_dict, unused_kwargs = pipeline_class.extract_init_dict(config_dict, **kwargs)
+
+        if len(unused_kwargs) > 0:
+            logger.warning(f"Keyword arguments {unused_kwargs} not recognized.")
 
         init_kwargs = {}
 
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index 682a7471f6af..8289e01e7dfa 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -1,9 +1,11 @@
 import inspect
+import logging
 import os
 import random
 import re
 import unittest
 from distutils.util import strtobool
+from io import StringIO
 from pathlib import Path
 from typing import Union
 
@@ -284,3 +286,42 @@ def summary_failures_short(tr):
     tr._tw = orig_writer
     tr.reportchars = orig_reportchars
     config.option.tbstyle = orig_tbstyle
+
+
+class CaptureLogger:
+    """
+    Args:
+    Context manager to capture `logging` streams
+        logger: 'logging` logger object
+    Returns:
+        The captured output is available via `self.out`
+    Example:
+    ```python
+    >>> from diffusers import logging
+    >>> from diffusers.testing_utils import CaptureLogger
+
+    >>> msg = "Testing 1, 2, 3"
+    >>> logging.set_verbosity_info()
+    >>> logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.py")
+    >>> with CaptureLogger(logger) as cl:
+    ...     logger.info(msg)
+    >>> assert cl.out, msg + "\n"
+    ```
+    """
+
+    def __init__(self, logger):
+        self.logger = logger
+        self.io = StringIO()
+        self.sh = logging.StreamHandler(self.io)
+        self.out = ""
+
+    def __enter__(self):
+        self.logger.addHandler(self.sh)
+        return self
+
+    def __exit__(self, *exc):
+        self.logger.removeHandler(self.sh)
+        self.out = self.io.getvalue()
+
+    def __repr__(self):
+        return f"captured: {self.out}\n"
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 1de2e6f5634f..3efedd503e77 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -51,11 +51,12 @@
     UNet2DConditionModel,
     UNet2DModel,
     VQModel,
+    logging,
 )
 from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, load_image, slow, torch_device
-from diffusers.utils.testing_utils import get_tests_dir
+from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir
 from packaging import version
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
@@ -1473,6 +1474,15 @@ def test_smart_download(self):
             # is not downloaded, but all the expected ones
             assert not os.path.isfile(os.path.join(snapshot_dir, "big_array.npy"))
 
+    def test_warning_unused_kwargs(self):
+        model_id = "hf-internal-testing/unet-pipeline-dummy"
+        logger = logging.get_logger("diffusers.pipeline_utils")
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with CaptureLogger(logger) as cap_logger:
+                DiffusionPipeline.from_pretrained(model_id, not_used=True, cache_dir=tmpdirname, force_download=True)
+
+        assert cap_logger.out == "Keyword arguments {'not_used': True} not recognized.\n"
+
     @property
     def dummy_safety_checker(self):
         def check(images, *args, **kwargs):

From ce7d96681ca85f44b2a57406d3234613272a0af5 Mon Sep 17 00:00:00 2001
From: Hanusz Leszek <leszek.hanusz@gmail.com>
Date: Thu, 20 Oct 2022 13:44:37 +0200
Subject: [PATCH 018/269] DOC Dreambooth Add --sample_batch_size=1 to the 8 GB
 dreambooth example script (#829)

Add --sample_batch_size=1 to the 8 GB dreambooth script
---
 examples/dreambooth/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index 62ab1b88c0cb..dfe54469d403 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -151,6 +151,7 @@ accelerate launch train_dreambooth.py \
   --class_prompt="a photo of dog" \
   --resolution=512 \
   --train_batch_size=1 \
+  --sample_batch_size=1 \
   --gradient_accumulation_steps=1 --gradient_checkpointing \
   --learning_rate=5e-6 \
   --lr_scheduler="constant" \

From a5eb7f4293ee4dc4ff38fa09fd2b83d3ee243743 Mon Sep 17 00:00:00 2001
From: Mikail Duzenli <45072645+MikailINTech@users.noreply.github.com>
Date: Thu, 20 Oct 2022 13:47:13 +0200
Subject: [PATCH 019/269] [Examples] add speech to image pipeline example
 (#897)

* First draft

* created the SpeechToImagePipeline class

* Corrected speech_to_image_diffusion.py style

* Added safety checker

* Corrected style

* Adding examples to README
---
 examples/community/README.md                  |  48 ++++
 .../community/speech_to_image_diffusion.py    | 261 ++++++++++++++++++
 2 files changed, 309 insertions(+)
 create mode 100644 examples/community/speech_to_image_diffusion.py

diff --git a/examples/community/README.md b/examples/community/README.md
index 54a3093ad3a3..9dc98c1ead2f 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -13,6 +13,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Stable Diffusion Interpolation         | Interpolate the latent space of Stable Diffusion between different prompts/seeds                                                                                                                                                                                                                                                                                                                                                                                                                         | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | -                                                                                                                                                                                                                  |                    [Nate Raw](https://github.com/nateraw/) |
 | Stable Diffusion Mega                  | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega)                   | -                                                                                                                                                                                                                  | [Patrick von Platen](https://github.com/patrickvonplaten/) |
 | Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt.                                                                                                                                                                                                                                                                                                                                                                                                  | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [SkyTNT](https://github.com/SkyTNT) |
+| Speech to Image                        | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images                                                                                                                                                                                                                                                                                                                                                                                                            | [Speech to Image](#speech-to-image)                               | -                                                                                                                                                                                                                  | [Mikail Duzenli](https://github.com/MikailINTech)
 
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
 ```py
@@ -216,3 +217,50 @@ pipe.text2img(prompt,negative_prompt=neg_prompt, width=512, height=512, max_embe
 ```
 
 if you see `Token indices sequence length is longer than the specified maximum sequence length for this model ( *** > 77 ) . Running this sequence through the model will result in indexing errors`. Do not worry, it is normal.
+
+### Speech to Image
+
+The following code can generate an image from an audio sample using pre-trained OpenAI whisper-small and Stable Diffusion.
+
+```Python
+import torch
+
+import matplotlib.pyplot as plt
+from datasets import load_dataset
+from diffusers import DiffusionPipeline
+from transformers import (
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+)
+
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+audio_sample = ds[3]
+
+text = audio_sample["text"].lower()
+speech_data = audio_sample["audio"]["array"]
+
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+
+diffuser_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="speech_to_image_diffusion",
+    speech_model=model,
+    speech_processor=processor,
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+
+diffuser_pipeline.enable_attention_slicing()
+diffuser_pipeline = diffuser_pipeline.to(device)
+
+output = diffuser_pipeline(speech_data)
+plt.imshow(output.images[0])
+```
+This example produces the following image:
+
+![image](https://user-images.githubusercontent.com/45072645/196901736-77d9c6fc-63ee-4072-90b0-dc8b903d63e3.png)
diff --git a/examples/community/speech_to_image_diffusion.py b/examples/community/speech_to_image_diffusion.py
new file mode 100644
index 000000000000..8b24914cb78f
--- /dev/null
+++ b/examples/community/speech_to_image_diffusion.py
@@ -0,0 +1,261 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.utils import logging
+from transformers import (
+    CLIPFeatureExtractor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class SpeechToImagePipeline(DiffusionPipeline):
+    def __init__(
+        self,
+        speech_model: WhisperForConditionalGeneration,
+        speech_processor: WhisperProcessor,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            speech_model=speech_model,
+            speech_processor=speech_processor,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        if slice_size == "auto":
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        audio,
+        sampling_rate=16_000,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        inputs = self.speech_processor.feature_extractor(
+            audio, return_tensors="pt", sampling_rate=sampling_rate
+        ).input_features.to(self.device)
+        predicted_ids = self.speech_model.generate(inputs, max_length=480_000)
+
+        prompt = self.speech_processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[
+            0
+        ]
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return image
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)

From 7674a36a34556c9f478f63c3bfb0670ad398f892 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Thu, 20 Oct 2022 13:52:11 +0200
Subject: [PATCH 020/269] [dreambooth] dont use safety check when generating
 prior images (#922)

dont' use safety check when generating prior images
---
 examples/dreambooth/README.md           | 1 -
 examples/dreambooth/requirements.txt    | 1 +
 examples/dreambooth/train_dreambooth.py | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index dfe54469d403..fb4bb900a953 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -10,7 +10,6 @@ The `train_dreambooth.py` script shows how to implement the training procedure a
 Before running the scripts, make sure to install the library's training dependencies:
 
 ```bash
-pip install git+https://github.com/huggingface/diffusers.git
 pip install -U -r requirements.txt
 ```
 
diff --git a/examples/dreambooth/requirements.txt b/examples/dreambooth/requirements.txt
index c0649bbe2bef..2174abfabbff 100644
--- a/examples/dreambooth/requirements.txt
+++ b/examples/dreambooth/requirements.txt
@@ -1,3 +1,4 @@
+diffusers>==0.5.0
 accelerate
 torchvision
 transformers>=4.21.0
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 40d5bca45de9..4fc483216954 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -343,7 +343,7 @@ def main():
         if cur_class_images < args.num_class_images:
             torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
             pipeline = StableDiffusionPipeline.from_pretrained(
-                args.pretrained_model_name_or_path, torch_dtype=torch_dtype
+                args.pretrained_model_name_or_path, torch_dtype=torch_dtype, safety_checker=None
             )
             pipeline.set_progress_bar_config(disable=True)
 

From 4bf675f4652759b42280103cb84ab0101cf23382 Mon Sep 17 00:00:00 2001
From: Hanusz Leszek <leszek.hanusz@gmail.com>
Date: Thu, 20 Oct 2022 15:56:15 +0200
Subject: [PATCH 021/269] Dreambooth class image generation: using unique names
 to avoid overwriting existing image (#847)

* Add an underscore to filename if it already exists

* Use sha1sum hash instead of adding underscores
---
 examples/dreambooth/train_dreambooth.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 4fc483216954..635166fb4081 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -1,4 +1,5 @@
 import argparse
+import hashlib
 import itertools
 import math
 import os
@@ -362,7 +363,9 @@ def main():
                 images = pipeline(example["prompt"]).images
 
                 for i, image in enumerate(images):
-                    image.save(class_images_dir / f"{example['index'][i] + cur_class_images}.jpg")
+                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
 
             del pipeline
             if torch.cuda.is_available():

From 8be48507a0ea4c9e6771fbfbac1333ee7bb90993 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Thu, 20 Oct 2022 16:25:12 +0200
Subject: [PATCH 022/269] fix test_components (#928)

---
 tests/test_pipelines.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 3efedd503e77..bbda797bf694 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -1405,7 +1405,7 @@ def test_components(self):
         mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
 
         # make sure here that pndm scheduler skips prk
-        inpaint = StableDiffusionInpaintPipeline(
+        inpaint = StableDiffusionInpaintPipelineLegacy(
             unet=unet,
             scheduler=scheduler,
             vae=vae,
@@ -1413,9 +1413,9 @@ def test_components(self):
             tokenizer=tokenizer,
             safety_checker=self.dummy_safety_checker,
             feature_extractor=self.dummy_extractor,
-        )
-        img2img = StableDiffusionImg2ImgPipeline(**inpaint.components)
-        text2img = StableDiffusionPipeline(**inpaint.components)
+        ).to(torch_device)
+        img2img = StableDiffusionImg2ImgPipeline(**inpaint.components).to(torch_device)
+        text2img = StableDiffusionPipeline(**inpaint.components).to(torch_device)
 
         prompt = "A painting of a squirrel eating a burger"
         generator = torch.Generator(device=torch_device).manual_seed(0)

From 6f6eef747ce2943d2b76d845b55220fd07fcf177 Mon Sep 17 00:00:00 2001
From: Krishna Penukonda <krishna@hypotenuse.ai>
Date: Fri, 21 Oct 2022 01:23:42 +0800
Subject: [PATCH 023/269] Fix Compatibility with Nvidia NGC Containers (#919)

Check if MPS backend is registered before calling is_available()
---
 src/diffusers/utils/testing_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index 8289e01e7dfa..77963ad7242e 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -29,7 +29,9 @@
     )
 
     if is_torch_higher_equal_than_1_12:
-        torch_device = "mps" if torch.backends.mps.is_available() else torch_device
+        # Some builds of torch 1.12 don't have the mps backend registered. See #892 for more details
+        mps_backend_registered = hasattr(torch.backends, "mps")
+        torch_device = "mps" if (mps_backend_registered and torch.backends.mps.is_available()) else torch_device
 
 
 def get_tests_dir(append_path=None):

From ba74a8be7a620da0558f27802a19736627e9e64a Mon Sep 17 00:00:00 2001
From: SkyTNT <SKYTNT@outlook.com>
Date: Fri, 21 Oct 2022 01:26:04 +0800
Subject: [PATCH 024/269] [Community Pipelines] Fix pad_tokens_and_weights in
 lpw_stable_diffusion (#925)

[Community Pipelines] fix pad_tokens_and_weights in lpw_stable_diffusion
---
 examples/community/lpw_stable_diffusion.py    | 73 ++++++++++++++-----
 .../community/lpw_stable_diffusion_onnx.py    | 51 +++++++++----
 2 files changed, 90 insertions(+), 34 deletions(-)

diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index 3d4ec23e3aea..4980f8c8be06 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -132,6 +132,7 @@ def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_len
     """
     tokens = []
     weights = []
+    truncated = False
     for text in prompt:
         texts_and_weights = parse_prompt_attention(text)
         text_token = []
@@ -140,21 +141,21 @@ def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_len
             # tokenize and discard the starting and the ending token
             token = pipe.tokenizer(word).input_ids[1:-1]
             text_token += token
-
             # copy the weight by length of token
             text_weight += [weight] * len(token)
-
             # stop if the text is too long (longer than truncation limit)
             if len(text_token) > max_length:
+                truncated = True
                 break
-
         # truncate
         if len(text_token) > max_length:
+            truncated = True
             text_token = text_token[:max_length]
             text_weight = text_weight[:max_length]
-
         tokens.append(text_token)
         weights.append(text_weight)
+    if truncated:
+        logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
     return tokens, weights
 
 
@@ -173,9 +174,9 @@ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_midd
             if len(weights[i]) == 0:
                 w = [1.0] * weights_length
             else:
-                for j in range((len(weights[i]) - 1) // chunk_length + 1):
+                for j in range(max_embeddings_multiples):
                     w.append(1.0)  # weight for starting token in this chunk
-                    w += weights[i][j * chunk_length : min(len(weights[i]), (j + 1) * chunk_length)]
+                    w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
                     w.append(1.0)  # weight for ending token in this chunk
                 w += [1.0] * (weights_length - len(w))
             weights[i] = w[:]
@@ -184,7 +185,10 @@ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_midd
 
 
 def get_unweighted_text_embeddings(
-    pipe: DiffusionPipeline, text_input: torch.Tensor, chunk_length: int, no_boseos_middle: Optional[bool] = True
+    pipe: DiffusionPipeline,
+    text_input: torch.Tensor,
+    chunk_length: int,
+    no_boseos_middle: Optional[bool] = True,
 ):
     """
     When the length of tokens is a multiple of the capacity of the text encoder,
@@ -285,7 +289,8 @@ def get_weighted_text_embeddings(
         max_length = max(max_length, max([len(token) for token in uncond_tokens]))
 
     max_embeddings_multiples = min(
-        max_embeddings_multiples, (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1
+        max_embeddings_multiples,
+        (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
     )
     max_embeddings_multiples = max(1, max_embeddings_multiples)
     max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
@@ -317,12 +322,18 @@ def get_weighted_text_embeddings(
 
     # get the embeddings
     text_embeddings = get_unweighted_text_embeddings(
-        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+        pipe,
+        prompt_tokens,
+        pipe.tokenizer.model_max_length,
+        no_boseos_middle=no_boseos_middle,
     )
     prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device)
     if uncond_prompt is not None:
         uncond_embeddings = get_unweighted_text_embeddings(
-            pipe, uncond_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+            pipe,
+            uncond_tokens,
+            pipe.tokenizer.model_max_length,
+            no_boseos_middle=no_boseos_middle,
         )
         uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device)
 
@@ -632,16 +643,29 @@ def __call__(
             # Unlike in other pipelines, latents need to be generated in the target device
             # for 1-to-1 results reproducibility with the CompVis implementation.
             # However this currently doesn't work in `mps`.
-            latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+            latents_shape = (
+                batch_size * num_images_per_prompt,
+                self.unet.in_channels,
+                height // 8,
+                width // 8,
+            )
 
             if latents is None:
                 if self.device.type == "mps":
                     # randn does not exist on mps
-                    latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
-                        self.device
-                    )
+                    latents = torch.randn(
+                        latents_shape,
+                        generator=generator,
+                        device="cpu",
+                        dtype=latents_dtype,
+                    ).to(self.device)
                 else:
-                    latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+                    latents = torch.randn(
+                        latents_shape,
+                        generator=generator,
+                        device=self.device,
+                        dtype=latents_dtype,
+                    )
             else:
                 if latents.shape != latents_shape:
                     raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
@@ -684,11 +708,19 @@ def __call__(
             # add noise to latents using the timesteps
             if self.device.type == "mps":
                 # randn does not exist on mps
-                noise = torch.randn(init_latents.shape, generator=generator, device="cpu", dtype=latents_dtype).to(
-                    self.device
-                )
+                noise = torch.randn(
+                    init_latents.shape,
+                    generator=generator,
+                    device="cpu",
+                    dtype=latents_dtype,
+                ).to(self.device)
             else:
-                noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
+                noise = torch.randn(
+                    init_latents.shape,
+                    generator=generator,
+                    device=self.device,
+                    dtype=latents_dtype,
+                )
             latents = self.scheduler.add_noise(init_latents, noise, timesteps)
 
             t_start = max(num_inference_steps - init_timestep + offset, 0)
@@ -741,7 +773,8 @@ def __call__(
                 self.device
             )
             image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype),
             )
         else:
             has_nsfw_concept = None
diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py
index b5b9a2a65fb0..4ca37c0c4ad4 100644
--- a/examples/community/lpw_stable_diffusion_onnx.py
+++ b/examples/community/lpw_stable_diffusion_onnx.py
@@ -130,6 +130,7 @@ def get_prompts_with_weights(pipe, prompt: List[str], max_length: int):
     """
     tokens = []
     weights = []
+    truncated = False
     for text in prompt:
         texts_and_weights = parse_prompt_attention(text)
         text_token = []
@@ -138,21 +139,21 @@ def get_prompts_with_weights(pipe, prompt: List[str], max_length: int):
             # tokenize and discard the starting and the ending token
             token = pipe.tokenizer(word, return_tensors="np").input_ids[0, 1:-1]
             text_token += list(token)
-
             # copy the weight by length of token
             text_weight += [weight] * len(token)
-
             # stop if the text is too long (longer than truncation limit)
             if len(text_token) > max_length:
+                truncated = True
                 break
-
         # truncate
         if len(text_token) > max_length:
+            truncated = True
             text_token = text_token[:max_length]
             text_weight = text_weight[:max_length]
-
         tokens.append(text_token)
         weights.append(text_weight)
+    if truncated:
+        logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
     return tokens, weights
 
 
@@ -171,9 +172,9 @@ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_midd
             if len(weights[i]) == 0:
                 w = [1.0] * weights_length
             else:
-                for j in range((len(weights[i]) - 1) // chunk_length + 1):
+                for j in range(max_embeddings_multiples):
                     w.append(1.0)  # weight for starting token in this chunk
-                    w += weights[i][j * chunk_length : min(len(weights[i]), (j + 1) * chunk_length)]
+                    w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
                     w.append(1.0)  # weight for ending token in this chunk
                 w += [1.0] * (weights_length - len(w))
             weights[i] = w[:]
@@ -182,7 +183,10 @@ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_midd
 
 
 def get_unweighted_text_embeddings(
-    pipe, text_input: np.array, chunk_length: int, no_boseos_middle: Optional[bool] = True
+    pipe,
+    text_input: np.array,
+    chunk_length: int,
+    no_boseos_middle: Optional[bool] = True,
 ):
     """
     When the length of tokens is a multiple of the capacity of the text encoder,
@@ -276,7 +280,10 @@ def get_weighted_text_embeddings(
             uncond_tokens = [
                 token[1:-1]
                 for token in pipe.tokenizer(
-                    uncond_prompt, max_length=max_length, truncation=True, return_tensors="np"
+                    uncond_prompt,
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="np",
                 ).input_ids
             ]
             uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
@@ -287,7 +294,8 @@ def get_weighted_text_embeddings(
         max_length = max(max_length, max([len(token) for token in uncond_tokens]))
 
     max_embeddings_multiples = min(
-        max_embeddings_multiples, (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1
+        max_embeddings_multiples,
+        (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
     )
     max_embeddings_multiples = max(1, max_embeddings_multiples)
     max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
@@ -319,12 +327,18 @@ def get_weighted_text_embeddings(
 
     # get the embeddings
     text_embeddings = get_unweighted_text_embeddings(
-        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+        pipe,
+        prompt_tokens,
+        pipe.tokenizer.model_max_length,
+        no_boseos_middle=no_boseos_middle,
     )
     prompt_weights = np.array(prompt_weights, dtype=text_embeddings.dtype)
     if uncond_prompt is not None:
         uncond_embeddings = get_unweighted_text_embeddings(
-            pipe, uncond_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle
+            pipe,
+            uncond_tokens,
+            pipe.tokenizer.model_max_length,
+            no_boseos_middle=no_boseos_middle,
         )
         uncond_weights = np.array(uncond_weights, dtype=uncond_embeddings.dtype)
 
@@ -559,7 +573,12 @@ def __call__(
         noise = None
 
         if init_image is None:
-            latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
+            latents_shape = (
+                batch_size * num_images_per_prompt,
+                4,
+                height // 8,
+                width // 8,
+            )
 
             if latents is None:
                 latents = generator.randn(*latents_shape).astype(latents_dtype)
@@ -625,7 +644,9 @@ def __call__(
 
             # predict the noise residual
             noise_pred = self.unet(
-                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
+                sample=latent_model_input,
+                timestep=np.array([t]),
+                encoder_hidden_states=text_embeddings,
             )
             noise_pred = noise_pred[0]
 
@@ -640,7 +661,9 @@ def __call__(
             if mask is not None:
                 # masking
                 init_latents_proper = self.scheduler.add_noise(
-                    torch.from_numpy(init_latents_orig), torch.from_numpy(noise), torch.tensor([t])
+                    torch.from_numpy(init_latents_orig),
+                    torch.from_numpy(noise),
+                    torch.tensor([t]),
                 ).numpy()
                 latents = (init_latents_proper * mask) + (latents * (1 - mask))
 

From cc36f2e7ff9414a27e6f4f1ff3c70f4baf38afe2 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 20 Oct 2022 20:25:20 +0200
Subject: [PATCH 025/269] Bump the version to 0.7.0.dev0 (#912)

* Bump the version to 0.7.0.dev0

* deprecate offsets

* deprecate LMS timesteps

* LMS 0.7.0->0.8.0
---
 setup.py                                            |  2 +-
 src/diffusers/__init__.py                           |  2 +-
 src/diffusers/schedulers/scheduling_ddim.py         | 11 +++--------
 src/diffusers/schedulers/scheduling_lms_discrete.py |  4 ++--
 src/diffusers/schedulers/scheduling_pndm.py         |  9 ++-------
 5 files changed, 9 insertions(+), 19 deletions(-)

diff --git a/setup.py b/setup.py
index 7532feb5a360..b4c1613ba115 100644
--- a/setup.py
+++ b/setup.py
@@ -211,7 +211,7 @@ def run(self):
 
 setup(
     name="diffusers",
-    version="0.6.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.7.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="Diffusers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 02d98fb71c31..8a0dbc9de125 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -9,7 +9,7 @@
 )
 
 
-__version__ = "0.6.0"
+__version__ = "0.7.0.dev0"
 
 from .configuration_utils import ConfigMixin
 from .onnx_utils import OnnxRuntimeModel
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 612c24f807a1..b94cada4ce64 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -23,7 +23,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput, deprecate
+from ..utils import BaseOutput
 from .scheduling_utils import SchedulerMixin
 
 
@@ -175,7 +175,7 @@ def _get_variance(self, timestep, prev_timestep):
 
         return variance
 
-    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None, **kwargs):
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
         """
         Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
 
@@ -183,18 +183,13 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
             num_inference_steps (`int`):
                 the number of diffusion steps used when generating samples with a pre-trained model.
         """
-        deprecated_offset = deprecate(
-            "offset", "0.7.0", "Please pass `steps_offset` to `__init__` instead.", take_from=kwargs
-        )
-        offset = deprecated_offset or self.config.steps_offset
-
         self.num_inference_steps = num_inference_steps
         step_ratio = self.config.num_train_timesteps // self.num_inference_steps
         # creates integer timesteps by multiplying by ratio
         # casting to int to avoid issues when num_inference_step is power of 3
         timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
         self.timesteps = torch.from_numpy(timesteps).to(device)
-        self.timesteps += offset
+        self.timesteps += self.config.steps_offset
 
     def step(
         self,
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index 1b8ca7c5df8d..2a5c1c39543b 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -209,7 +209,7 @@ def step(
         ):
             deprecate(
                 "timestep as an index",
-                "0.7.0",
+                "0.8.0",
                 "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
                 " `LMSDiscreteScheduler.step()` will not be supported in future versions. Make sure to pass"
                 " one of the `scheduler.timesteps` as a timestep.",
@@ -259,7 +259,7 @@ def add_noise(
         if isinstance(timesteps, torch.IntTensor) or isinstance(timesteps, torch.LongTensor):
             deprecate(
                 "timesteps as indices",
-                "0.7.0",
+                "0.8.0",
                 "Passing integer indices  (e.g. from `enumerate(timesteps)`) as timesteps to"
                 " `LMSDiscreteScheduler.add_noise()` will not be supported in future versions. Make sure to"
                 " pass values from `scheduler.timesteps` as timesteps.",
diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py
index 829329d2e10b..03ea5c15d1b0 100644
--- a/src/diffusers/schedulers/scheduling_pndm.py
+++ b/src/diffusers/schedulers/scheduling_pndm.py
@@ -21,7 +21,6 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import deprecate
 from .scheduling_utils import SchedulerMixin, SchedulerOutput
 
 
@@ -142,7 +141,7 @@ def __init__(
         self.plms_timesteps = None
         self.timesteps = None
 
-    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None, **kwargs):
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
         """
         Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
 
@@ -150,17 +149,13 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
             num_inference_steps (`int`):
                 the number of diffusion steps used when generating samples with a pre-trained model.
         """
-        deprecated_offset = deprecate(
-            "offset", "0.7.0", "Please pass `steps_offset` to `__init__` instead.", take_from=kwargs
-        )
-        offset = deprecated_offset or self.config.steps_offset
 
         self.num_inference_steps = num_inference_steps
         step_ratio = self.config.num_train_timesteps // self.num_inference_steps
         # creates integer timesteps by multiplying by ratio
         # casting to int to avoid issues when num_inference_step is power of 3
         self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()
-        self._timesteps += offset
+        self._timesteps += self.config.steps_offset
 
         if self.config.skip_prk_steps:
             # for some models like stable diffusion the prk steps can/should be skipped to

From 32bf4fdc4386809c870528cb261028baae012d27 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 20 Oct 2022 20:26:03 +0200
Subject: [PATCH 026/269] Introduce the copy mechanism (#924)

* Introduce the copy mechanism

* init tests

* fix dummy tests

* with

* update copies tests
---
 .github/workflows/pr_quality.yml              |  17 ++
 Makefile                                      |   1 +
 src/diffusers/schedulers/scheduling_ddim.py   |   1 +
 .../schedulers/scheduling_lms_discrete.py     |   1 +
 tests/repo_utils/test_check_copies.py         | 120 ++++++++
 tests/repo_utils/test_check_dummies.py        | 124 ++++++++
 utils/check_copies.py                         | 281 +-----------------
 utils/check_dummies.py                        |   7 +-
 8 files changed, 284 insertions(+), 268 deletions(-)
 create mode 100644 tests/repo_utils/test_check_copies.py
 create mode 100644 tests/repo_utils/test_check_dummies.py

diff --git a/.github/workflows/pr_quality.yml b/.github/workflows/pr_quality.yml
index 5a850bea47a1..8d6e20efe364 100644
--- a/.github/workflows/pr_quality.yml
+++ b/.github/workflows/pr_quality.yml
@@ -31,3 +31,20 @@ jobs:
           isort --check-only examples tests src utils scripts
           flake8 examples tests src utils scripts
           doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source
+
+  check_repository_consistency:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.7"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check quality
+        run: |
+          python utils/check_copies.py
+          python utils/check_dummies.py
diff --git a/Makefile b/Makefile
index 6e513e2e4743..ea7537f2fd85 100644
--- a/Makefile
+++ b/Makefile
@@ -67,6 +67,7 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency
 # Make marked copies of snippets of codes conform to the original
 
 fix-copies:
+	python utils/check_copies.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
 
 # Run tests for the library
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index b94cada4ce64..58e40b57e923 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -28,6 +28,7 @@
 
 
 @dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
 class DDIMSchedulerOutput(BaseOutput):
     """
     Output class for the scheduler's step function output.
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index 2a5c1c39543b..768413e9e6a3 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -26,6 +26,7 @@
 
 
 @dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->LMSDiscrete
 class LMSDiscreteSchedulerOutput(BaseOutput):
     """
     Output class for the scheduler's step function output.
diff --git a/tests/repo_utils/test_check_copies.py b/tests/repo_utils/test_check_copies.py
new file mode 100644
index 000000000000..65128f68d1ac
--- /dev/null
+++ b/tests/repo_utils/test_check_copies.py
@@ -0,0 +1,120 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import shutil
+import sys
+import tempfile
+import unittest
+
+import black
+
+
+git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+sys.path.append(os.path.join(git_repo_path, "utils"))
+
+import check_copies  # noqa: E402
+
+
+# This is the reference code that will be used in the tests.
+# If DDPMSchedulerOutput is changed in scheduling_ddpm.py, this code needs to be manually updated.
+REFERENCE_CODE = """    \"""
+    Output class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    \"""
+
+    prev_sample: torch.FloatTensor
+    pred_original_sample: Optional[torch.FloatTensor] = None
+"""
+
+
+class CopyCheckTester(unittest.TestCase):
+    def setUp(self):
+        self.diffusers_dir = tempfile.mkdtemp()
+        os.makedirs(os.path.join(self.diffusers_dir, "schedulers/"))
+        check_copies.DIFFUSERS_PATH = self.diffusers_dir
+        shutil.copy(
+            os.path.join(git_repo_path, "src/diffusers/schedulers/scheduling_ddpm.py"),
+            os.path.join(self.diffusers_dir, "schedulers/scheduling_ddpm.py"),
+        )
+
+    def tearDown(self):
+        check_copies.DIFFUSERS_PATH = "src/diffusers"
+        shutil.rmtree(self.diffusers_dir)
+
+    def check_copy_consistency(self, comment, class_name, class_code, overwrite_result=None):
+        code = comment + f"\nclass {class_name}(nn.Module):\n" + class_code
+        if overwrite_result is not None:
+            expected = comment + f"\nclass {class_name}(nn.Module):\n" + overwrite_result
+        mode = black.Mode(target_versions={black.TargetVersion.PY35}, line_length=119)
+        code = black.format_str(code, mode=mode)
+        fname = os.path.join(self.diffusers_dir, "new_code.py")
+        with open(fname, "w", newline="\n") as f:
+            f.write(code)
+        if overwrite_result is None:
+            self.assertTrue(len(check_copies.is_copy_consistent(fname)) == 0)
+        else:
+            check_copies.is_copy_consistent(f.name, overwrite=True)
+            with open(fname, "r") as f:
+                self.assertTrue(f.read(), expected)
+
+    def test_find_code_in_diffusers(self):
+        code = check_copies.find_code_in_diffusers("schedulers.scheduling_ddpm.DDPMSchedulerOutput")
+        self.assertEqual(code, REFERENCE_CODE)
+
+    def test_is_copy_consistent(self):
+        # Base copy consistency
+        self.check_copy_consistency(
+            "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput",
+            "DDPMSchedulerOutput",
+            REFERENCE_CODE + "\n",
+        )
+
+        # With no empty line at the end
+        self.check_copy_consistency(
+            "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput",
+            "DDPMSchedulerOutput",
+            REFERENCE_CODE,
+        )
+
+        # Copy consistency with rename
+        self.check_copy_consistency(
+            "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->Test",
+            "TestSchedulerOutput",
+            re.sub("DDPM", "Test", REFERENCE_CODE),
+        )
+
+        # Copy consistency with a really long name
+        long_class_name = "TestClassWithAReallyLongNameBecauseSomePeopleLikeThatForSomeReason"
+        self.check_copy_consistency(
+            f"# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->{long_class_name}",
+            f"{long_class_name}SchedulerOutput",
+            re.sub("Bert", long_class_name, REFERENCE_CODE),
+        )
+
+        # Copy consistency with overwrite
+        self.check_copy_consistency(
+            "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->Test",
+            "TestSchedulerOutput",
+            REFERENCE_CODE,
+            overwrite_result=re.sub("DDPM", "Test", REFERENCE_CODE),
+        )
diff --git a/tests/repo_utils/test_check_dummies.py b/tests/repo_utils/test_check_dummies.py
new file mode 100644
index 000000000000..d8fa9ce10547
--- /dev/null
+++ b/tests/repo_utils/test_check_dummies.py
@@ -0,0 +1,124 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import unittest
+
+
+git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+sys.path.append(os.path.join(git_repo_path, "utils"))
+
+import check_dummies
+from check_dummies import create_dummy_files, create_dummy_object, find_backend, read_init  # noqa: E402
+
+
+# Align TRANSFORMERS_PATH in check_dummies with the current path
+check_dummies.PATH_TO_DIFFUSERS = os.path.join(git_repo_path, "src", "diffusers")
+
+
+class CheckDummiesTester(unittest.TestCase):
+    def test_find_backend(self):
+        simple_backend = find_backend("    if not is_torch_available():")
+        self.assertEqual(simple_backend, "torch")
+
+        # backend_with_underscore = find_backend("    if not is_tensorflow_text_available():")
+        # self.assertEqual(backend_with_underscore, "tensorflow_text")
+
+        double_backend = find_backend("    if not (is_torch_available() and is_transformers_available()):")
+        self.assertEqual(double_backend, "torch_and_transformers")
+
+        # double_backend_with_underscore = find_backend(
+        #    "    if not (is_sentencepiece_available() and is_tensorflow_text_available()):"
+        # )
+        # self.assertEqual(double_backend_with_underscore, "sentencepiece_and_tensorflow_text")
+
+        triple_backend = find_backend(
+            "    if not (is_torch_available() and is_transformers_available() and is_onnx_available()):"
+        )
+        self.assertEqual(triple_backend, "torch_and_transformers_and_onnx")
+
+    def test_read_init(self):
+        objects = read_init()
+        # We don't assert on the exact list of keys to allow for smooth grow of backend-specific objects
+        self.assertIn("torch", objects)
+        self.assertIn("torch_and_transformers", objects)
+        self.assertIn("flax_and_transformers", objects)
+        self.assertIn("torch_and_transformers_and_onnx", objects)
+
+        # Likewise, we can't assert on the exact content of a key
+        self.assertIn("UNet2DModel", objects["torch"])
+        self.assertIn("FlaxUNet2DConditionModel", objects["flax"])
+        self.assertIn("StableDiffusionPipeline", objects["torch_and_transformers"])
+        self.assertIn("FlaxStableDiffusionPipeline", objects["flax_and_transformers"])
+        self.assertIn("LMSDiscreteScheduler", objects["torch_and_scipy"])
+        self.assertIn("OnnxStableDiffusionPipeline", objects["torch_and_transformers_and_onnx"])
+
+    def test_create_dummy_object(self):
+        dummy_constant = create_dummy_object("CONSTANT", "'torch'")
+        self.assertEqual(dummy_constant, "\nCONSTANT = None\n")
+
+        dummy_function = create_dummy_object("function", "'torch'")
+        self.assertEqual(
+            dummy_function, "\ndef function(*args, **kwargs):\n    requires_backends(function, 'torch')\n"
+        )
+
+        expected_dummy_class = """
+class FakeClass(metaclass=DummyObject):
+    _backends = 'torch'
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, 'torch')
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, 'torch')
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, 'torch')
+"""
+        dummy_class = create_dummy_object("FakeClass", "'torch'")
+        self.assertEqual(dummy_class, expected_dummy_class)
+
+    def test_create_dummy_files(self):
+        expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit.
+# flake8: noqa
+
+from ..utils import DummyObject, requires_backends
+
+
+CONSTANT = None
+
+
+def function(*args, **kwargs):
+    requires_backends(function, ["torch"])
+
+
+class FakeClass(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+"""
+        dummy_files = create_dummy_files({"torch": ["CONSTANT", "function", "FakeClass"]})
+        self.assertEqual(dummy_files["torch"], expected_dummy_pytorch_file)
diff --git a/utils/check_copies.py b/utils/check_copies.py
index 50f02cac658d..395cefb9c48b 100644
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team.
+# Copyright 2022 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 
 import argparse
 import glob
+import importlib.util
 import os
 import re
 
@@ -24,52 +25,17 @@
 
 # All paths are set with the intent you should run this script from the root of the repo with the command
 # python utils/check_copies.py
-TRANSFORMERS_PATH = "src/diffusers"
-PATH_TO_DOCS = "docs/source/en"
+DIFFUSERS_PATH = "src/diffusers"
 REPO_PATH = "."
 
-# Mapping for files that are full copies of others (keys are copies, values the file to keep them up to data with)
-FULL_COPIES = {
-    "examples/tensorflow/question-answering/utils_qa.py": "examples/pytorch/question-answering/utils_qa.py",
-    "examples/flax/question-answering/utils_qa.py": "examples/pytorch/question-answering/utils_qa.py",
-}
-
-
-LOCALIZED_READMES = {
-    # If the introduction or the conclusion of the list change, the prompts may need to be updated.
-    "README.md": {
-        "start_prompt": "🤗 Transformers currently provides the following architectures",
-        "end_prompt": "1. Want to contribute a new model?",
-        "format_model_list": (
-            "**[{title}]({model_link})** (from {paper_affiliations}) released with the paper {paper_title_link} by"
-            " {paper_authors}.{supplements}"
-        ),
-    },
-    "README_zh-hans.md": {
-        "start_prompt": "🤗 Transformers 目前支持如下的架构",
-        "end_prompt": "1. 想要贡献新的模型？",
-        "format_model_list": (
-            "**[{title}]({model_link})** (来自 {paper_affiliations}) 伴随论文 {paper_title_link} 由 {paper_authors}"
-            " 发布。{supplements}"
-        ),
-    },
-    "README_zh-hant.md": {
-        "start_prompt": "🤗 Transformers 目前支援以下的架構",
-        "end_prompt": "1. 想要貢獻新的模型？",
-        "format_model_list": (
-            "**[{title}]({model_link})** (from {paper_affiliations}) released with the paper {paper_title_link} by"
-            " {paper_authors}.{supplements}"
-        ),
-    },
-    "README_ko.md": {
-        "start_prompt": "🤗 Transformers는 다음 모델들을 제공합니다",
-        "end_prompt": "1. 새로운 모델을 올리고 싶나요?",
-        "format_model_list": (
-            "**[{title}]({model_link})** (from {paper_affiliations}) released with the paper {paper_title_link} by"
-            " {paper_authors}.{supplements}"
-        ),
-    },
-}
+
+# This is to make sure the diffusers module imported is the one in the repo.
+spec = importlib.util.spec_from_file_location(
+    "diffusers",
+    os.path.join(DIFFUSERS_PATH, "__init__.py"),
+    submodule_search_locations=[DIFFUSERS_PATH],
+)
+diffusers_module = spec.loader.load_module()
 
 
 def _should_continue(line, indent):
@@ -83,14 +49,14 @@ def find_code_in_diffusers(object_name):
 
     # First let's find the module where our object lives.
     module = parts[i]
-    while i < len(parts) and not os.path.isfile(os.path.join(TRANSFORMERS_PATH, f"{module}.py")):
+    while i < len(parts) and not os.path.isfile(os.path.join(DIFFUSERS_PATH, f"{module}.py")):
         i += 1
         if i < len(parts):
             module = os.path.join(module, parts[i])
     if i >= len(parts):
         raise ValueError(f"`object_name` should begin with the name of a module of diffusers but got {object_name}.")
 
-    with open(os.path.join(TRANSFORMERS_PATH, f"{module}.py"), "r", encoding="utf-8", newline="\n") as f:
+    with open(os.path.join(DIFFUSERS_PATH, f"{module}.py"), "r", encoding="utf-8", newline="\n") as f:
         lines = f.readlines()
 
     # Now let's find the class / func in the code!
@@ -121,6 +87,7 @@ def find_code_in_diffusers(object_name):
 
 _re_copy_warning = re.compile(r"^(\s*)#\s*Copied from\s+diffusers\.(\S+\.\S+)\s*($|\S.*$)")
 _re_replace_pattern = re.compile(r"^\s*(\S+)->(\S+)(\s+.*|$)")
+_re_fill_pattern = re.compile(r"<FILL\s+[^>]*>")
 
 
 def get_indent(code):
@@ -140,7 +107,7 @@ def blackify(code):
     has_indent = len(get_indent(code)) > 0
     if has_indent:
         code = f"class Bla:\n{code}"
-    mode = black.Mode(target_versions={black.TargetVersion.PY35}, line_length=119, preview=True)
+    mode = black.Mode(target_versions={black.TargetVersion.PY37}, line_length=119, preview=True)
     result = black.format_str(code, mode=mode)
     result, _ = style_docstrings_in_code(result)
     return result[len("class Bla:\n") :] if has_indent else result
@@ -149,7 +116,6 @@ def blackify(code):
 def is_copy_consistent(filename, overwrite=False):
     """
     Check if the code commented as a copy in `filename` matches the original.
-
     Return the differences or overwrites the content depending on `overwrite`.
     """
     with open(filename, "r", encoding="utf-8", newline="\n") as f:
@@ -221,7 +187,7 @@ def is_copy_consistent(filename, overwrite=False):
 
 
 def check_copies(overwrite: bool = False):
-    all_files = glob.glob(os.path.join(TRANSFORMERS_PATH, "**/*.py"), recursive=True)
+    all_files = glob.glob(os.path.join(DIFFUSERS_PATH, "**/*.py"), recursive=True)
     diffs = []
     for filename in all_files:
         new_diffs = is_copy_consistent(filename, overwrite)
@@ -235,224 +201,9 @@ def check_copies(overwrite: bool = False):
         )
 
 
-#    check_model_list_copy(overwrite=overwrite)
-
-
-def check_full_copies(overwrite: bool = False):
-    diffs = []
-    for target, source in FULL_COPIES.items():
-        with open(source, "r", encoding="utf-8") as f:
-            source_code = f.read()
-        with open(target, "r", encoding="utf-8") as f:
-            target_code = f.read()
-        if source_code != target_code:
-            if overwrite:
-                with open(target, "w", encoding="utf-8") as f:
-                    print(f"Replacing the content of {target} by the one of {source}.")
-                    f.write(source_code)
-            else:
-                diffs.append(f"- {target}: copy does not match {source}.")
-
-    if not overwrite and len(diffs) > 0:
-        diff = "\n".join(diffs)
-        raise Exception(
-            "Found the following copy inconsistencies:\n"
-            + diff
-            + "\nRun `make fix-copies` or `python utils/check_copies.py --fix_and_overwrite` to fix them."
-        )
-
-
-def get_model_list(filename, start_prompt, end_prompt):
-    """Extracts the model list from the README."""
-    with open(os.path.join(REPO_PATH, filename), "r", encoding="utf-8", newline="\n") as f:
-        lines = f.readlines()
-    # Find the start of the list.
-    start_index = 0
-    while not lines[start_index].startswith(start_prompt):
-        start_index += 1
-    start_index += 1
-
-    result = []
-    current_line = ""
-    end_index = start_index
-
-    while not lines[end_index].startswith(end_prompt):
-        if lines[end_index].startswith("1."):
-            if len(current_line) > 1:
-                result.append(current_line)
-            current_line = lines[end_index]
-        elif len(lines[end_index]) > 1:
-            current_line = f"{current_line[:-1]} {lines[end_index].lstrip()}"
-        end_index += 1
-    if len(current_line) > 1:
-        result.append(current_line)
-
-    return "".join(result)
-
-
-def convert_to_localized_md(model_list, localized_model_list, format_str):
-    """Convert `model_list` to each localized README."""
-
-    def _rep(match):
-        title, model_link, paper_affiliations, paper_title_link, paper_authors, supplements = match.groups()
-        return format_str.format(
-            title=title,
-            model_link=model_link,
-            paper_affiliations=paper_affiliations,
-            paper_title_link=paper_title_link,
-            paper_authors=paper_authors,
-            supplements=" " + supplements.strip() if len(supplements) != 0 else "",
-        )
-
-    # This regex captures metadata from an English model description, including model title, model link,
-    # affiliations of the paper, title of the paper, authors of the paper, and supplemental data (see DistilBERT for example).
-    _re_capture_meta = re.compile(
-        r"\*\*\[([^\]]*)\]\(([^\)]*)\)\*\* \(from ([^)]*)\)[^\[]*([^\)]*\)).*?by (.*?[A-Za-z\*]{2,}?)\. (.*)$"
-    )
-    # This regex is used to synchronize link.
-    _re_capture_title_link = re.compile(r"\*\*\[([^\]]*)\]\(([^\)]*)\)\*\*")
-
-    if len(localized_model_list) == 0:
-        localized_model_index = {}
-    else:
-        try:
-            localized_model_index = {
-                re.search(r"\*\*\[([^\]]*)", line).groups()[0]: line
-                for line in localized_model_list.strip().split("\n")
-            }
-        except AttributeError:
-            raise AttributeError("A model name in localized READMEs cannot be recognized.")
-
-    model_keys = [re.search(r"\*\*\[([^\]]*)", line).groups()[0] for line in model_list.strip().split("\n")]
-
-    # We exclude keys in localized README not in the main one.
-    readmes_match = not any([k not in model_keys for k in localized_model_index])
-    localized_model_index = {k: v for k, v in localized_model_index.items() if k in model_keys}
-
-    for model in model_list.strip().split("\n"):
-        title, model_link = _re_capture_title_link.search(model).groups()
-        if title not in localized_model_index:
-            readmes_match = False
-            # Add an anchor white space behind a model description string for regex.
-            # If metadata cannot be captured, the English version will be directly copied.
-            localized_model_index[title] = _re_capture_meta.sub(_rep, model + " ")
-        else:
-            # Synchronize link
-            localized_model_index[title] = _re_capture_title_link.sub(
-                f"**[{title}]({model_link})**", localized_model_index[title], count=1
-            )
-
-    sorted_index = sorted(localized_model_index.items(), key=lambda x: x[0].lower())
-
-    return readmes_match, "\n".join(map(lambda x: x[1], sorted_index)) + "\n"
-
-
-def convert_readme_to_index(model_list):
-    model_list = model_list.replace("https://huggingface.co/docs/diffusers/main/", "")
-    return model_list.replace("https://huggingface.co/docs/diffusers/", "")
-
-
-def _find_text_in_file(filename, start_prompt, end_prompt):
-    """
-    Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty
-    lines.
-    """
-    with open(filename, "r", encoding="utf-8", newline="\n") as f:
-        lines = f.readlines()
-    # Find the start prompt.
-    start_index = 0
-    while not lines[start_index].startswith(start_prompt):
-        start_index += 1
-    start_index += 1
-
-    end_index = start_index
-    while not lines[end_index].startswith(end_prompt):
-        end_index += 1
-    end_index -= 1
-
-    while len(lines[start_index]) <= 1:
-        start_index += 1
-    while len(lines[end_index]) <= 1:
-        end_index -= 1
-    end_index += 1
-    return "".join(lines[start_index:end_index]), start_index, end_index, lines
-
-
-def check_model_list_copy(overwrite=False, max_per_line=119):
-    """Check the model lists in the README and index.rst are consistent and maybe `overwrite`."""
-    # Fix potential doc links in the README
-    with open(os.path.join(REPO_PATH, "README.md"), "r", encoding="utf-8", newline="\n") as f:
-        readme = f.read()
-    new_readme = readme.replace("https://huggingface.co/diffusers", "https://huggingface.co/docs/diffusers")
-    new_readme = new_readme.replace(
-        "https://huggingface.co/docs/main/diffusers", "https://huggingface.co/docs/diffusers/main"
-    )
-    if new_readme != readme:
-        if overwrite:
-            with open(os.path.join(REPO_PATH, "README.md"), "w", encoding="utf-8", newline="\n") as f:
-                f.write(new_readme)
-        else:
-            raise ValueError(
-                "The main README contains wrong links to the documentation of Transformers. Run `make fix-copies` to "
-                "automatically fix them."
-            )
-
-    # If the introduction or the conclusion of the list change, the prompts may need to be updated.
-    index_list, start_index, end_index, lines = _find_text_in_file(
-        filename=os.path.join(PATH_TO_DOCS, "index.mdx"),
-        start_prompt="<!--This list is updated automatically from the README",
-        end_prompt="### Supported frameworks",
-    )
-    md_list = get_model_list(
-        filename="README.md",
-        start_prompt=LOCALIZED_READMES["README.md"]["start_prompt"],
-        end_prompt=LOCALIZED_READMES["README.md"]["end_prompt"],
-    )
-
-    converted_md_lists = []
-    for filename, value in LOCALIZED_READMES.items():
-        _start_prompt = value["start_prompt"]
-        _end_prompt = value["end_prompt"]
-        _format_model_list = value["format_model_list"]
-
-        localized_md_list = get_model_list(filename, _start_prompt, _end_prompt)
-        readmes_match, converted_md_list = convert_to_localized_md(md_list, localized_md_list, _format_model_list)
-
-        converted_md_lists.append((filename, readmes_match, converted_md_list, _start_prompt, _end_prompt))
-
-    converted_md_list = convert_readme_to_index(md_list)
-    if converted_md_list != index_list:
-        if overwrite:
-            with open(os.path.join(PATH_TO_DOCS, "index.mdx"), "w", encoding="utf-8", newline="\n") as f:
-                f.writelines(lines[:start_index] + [converted_md_list] + lines[end_index:])
-        else:
-            raise ValueError(
-                "The model list in the README changed and the list in `index.mdx` has not been updated. Run "
-                "`make fix-copies` to fix this."
-            )
-
-    for converted_md_list in converted_md_lists:
-        filename, readmes_match, converted_md, _start_prompt, _end_prompt = converted_md_list
-
-        if filename == "README.md":
-            continue
-        if overwrite:
-            _, start_index, end_index, lines = _find_text_in_file(
-                filename=os.path.join(REPO_PATH, filename), start_prompt=_start_prompt, end_prompt=_end_prompt
-            )
-            with open(os.path.join(REPO_PATH, filename), "w", encoding="utf-8", newline="\n") as f:
-                f.writelines(lines[:start_index] + [converted_md] + lines[end_index:])
-        elif not readmes_match:
-            raise ValueError(
-                f"The model list in the README changed and the list in `{filename}` has not been updated. Run "
-                "`make fix-copies` to fix this."
-            )
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
     args = parser.parse_args()
 
     check_copies(args.fix_and_overwrite)
-    check_full_copies(args.fix_and_overwrite)
diff --git a/utils/check_dummies.py b/utils/check_dummies.py
index 82d54b2a297a..c5664fa77a92 100644
--- a/utils/check_dummies.py
+++ b/utils/check_dummies.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team.
+# Copyright 2022 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -106,9 +106,10 @@ def create_dummy_object(name, backend_name):
         return DUMMY_CLASS.format(name, backend_name)
 
 
-def create_dummy_files():
+def create_dummy_files(backend_specific_objects=None):
     """Create the content of the dummy files."""
-    backend_specific_objects = read_init()
+    if backend_specific_objects is None:
+        backend_specific_objects = read_init()
     # For special correspondence backend to module name as used in the function requires_modulename
     dummy_files = {}
 

From 25dfd0f8dcc2ebf3e7fd5be651d5de9b417568d5 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 21 Oct 2022 12:49:52 +0200
Subject: [PATCH 027/269] [Tests] Move stable diffusion into their own files
 (#936)

* [Tests] Move stable diffusion into their own files

* up
---
 src/diffusers/pipeline_utils.py               |    2 +-
 .../stable_diffusion/test_stable_diffusion.py |  729 +++++++
 .../test_stable_diffusion_img2img.py          |  601 ++++++
 .../test_stable_diffusion_inpaint.py          |  384 ++++
 .../test_stable_diffusion_inpaint_legacy.py   |  491 +++++
 tests/test_pipelines.py                       | 1756 +----------------
 6 files changed, 2283 insertions(+), 1680 deletions(-)
 create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion.py
 create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
 create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
 create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py

diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 5d186f35ea11..d1e037499b65 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -122,7 +122,7 @@ def register_modules(self, **kwargs):
                 library = module.__module__.split(".")[0]
 
                 # check if the module is a pipeline module
-                pipeline_dir = module.__module__.split(".")[-2]
+                pipeline_dir = module.__module__.split(".")[-2] if len(module.__module__.split(".")) > 2 else None
                 path = module.__module__.split(".")
                 is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir)
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
new file mode 100644
index 000000000000..e50a22dbaaf8
--- /dev/null
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -0,0 +1,729 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    UNet2DModel,
+    VQModel,
+)
+from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class PipelineFastTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_cond_unet_inpaint(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vq_model(self):
+        torch.manual_seed(0)
+        model = VQModel(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=3,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_stable_diffusion_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_ddim_factor_8(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            height=536,
+            width=536,
+            num_inference_steps=2,
+            output_type="np",
+        )
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 134, 134, 3)
+        expected_slice = np.array([0.7834, 0.5488, 0.5781, 0.46, 0.3609, 0.5369, 0.542, 0.4855, 0.5557])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_pndm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_no_safety_checker(self):
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
+        )
+        assert isinstance(pipe, StableDiffusionPipeline)
+        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
+        assert pipe.safety_checker is None
+
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+    def test_stable_diffusion_k_lms(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_attention_chunk(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        # make sure chunking the attention yields the same result
+        sd_pipe.enable_attention_slicing(slice_size=1)
+        generator = torch.Generator(device=device).manual_seed(0)
+        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 1e-4
+
+    def test_stable_diffusion_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        negative_prompt = "french fries"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            prompt,
+            negative_prompt=negative_prompt,
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.4851, 0.4617, 0.4765, 0.5127, 0.4845, 0.5153, 0.5141, 0.4886, 0.4719])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_num_images_per_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        # test num_images_per_prompt=1 (default)
+        images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images
+
+        assert images.shape == (1, 128, 128, 3)
+
+        # test num_images_per_prompt=1 (default) for batch of prompts
+        batch_size = 2
+        images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images
+
+        assert images.shape == (batch_size, 128, 128, 3)
+
+        # test num_images_per_prompt for single prompt
+        num_images_per_prompt = 2
+        images = sd_pipe(
+            prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
+        ).images
+
+        assert images.shape == (num_images_per_prompt, 128, 128, 3)
+
+        # test num_images_per_prompt for batch of prompts
+        batch_size = 2
+        images = sd_pipe(
+            [prompt] * batch_size, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
+        ).images
+
+        assert images.shape == (batch_size * num_images_per_prompt, 128, 128, 3)
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_fp16(self):
+        """Test that stable diffusion works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
+
+        assert image.shape == (1, 128, 128, 3)
+
+
+@slow
+@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
+class PipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion(self):
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast("cuda"):
+            output = sd_pipe(
+                [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np"
+            )
+
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.8887, 0.915, 0.91, 0.894, 0.909, 0.912, 0.919, 0.925, 0.883])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_fast_ddim(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        sd_pipe.scheduler = scheduler
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+
+        with torch.autocast("cuda"):
+            output = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy")
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.9326, 0.923, 0.951, 0.9365, 0.9214, 0.951, 0.9365, 0.9414, 0.918])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_lms_stable_diffusion_pipeline(self):
+        model_id = "CompVis/stable-diffusion-v1-1"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id).to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        scheduler = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
+        pipe.scheduler = scheduler
+
+        prompt = "a photograph of an astronaut riding a horse"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe(
+            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+        ).images
+
+        image_slice = image[0, -3:, -3:, -1]
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.9077, 0.9254, 0.9181, 0.9227, 0.9213, 0.9367, 0.9399, 0.9406, 0.9024])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_memory_chunking(self):
+        torch.cuda.reset_peak_memory_stats()
+        model_id = "CompVis/stable-diffusion-v1-4"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16).to(
+            torch_device
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photograph of an astronaut riding a horse"
+
+        # make attention efficient
+        pipe.enable_attention_slicing()
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            output_chunked = pipe(
+                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+            )
+            image_chunked = output_chunked.images
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+        # make sure that less than 3.75 GB is allocated
+        assert mem_bytes < 3.75 * 10**9
+
+        # disable chunking
+        pipe.disable_attention_slicing()
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            output = pipe(
+                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+            )
+            image = output.images
+
+        # make sure that more than 3.75 GB is allocated
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes > 3.75 * 10**9
+        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-3
+
+    def test_stable_diffusion_text2img_pipeline_fp16(self):
+        torch.cuda.reset_peak_memory_stats()
+        model_id = "CompVis/stable-diffusion-v1-4"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16).to(
+            torch_device
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photograph of an astronaut riding a horse"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output_chunked = pipe(
+            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+        )
+        image_chunked = output_chunked.images
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            output = pipe(
+                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+            )
+            image = output.images
+
+        # Make sure results are close enough
+        diff = np.abs(image_chunked.flatten() - image.flatten())
+        # They ARE different since ops are not run always at the same precision
+        # however, they should be extremely close.
+        assert diff.mean() < 2e-2
+
+    def test_stable_diffusion_text2img_pipeline(self):
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/text2img/astronaut_riding_a_horse.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        model_id = "CompVis/stable-diffusion-v1-4"
+        pipe = StableDiffusionPipeline.from_pretrained(
+            model_id,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "astronaut riding a horse"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(prompt=prompt, strength=0.75, guidance_scale=7.5, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
+
+    def test_stable_diffusion_text2img_intermediate_state(self):
+        number_of_steps = 0
+
+        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
+            test_callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 0:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [1.8285, 1.2857, -0.1024, 1.2406, -2.3068, 1.0747, -0.0818, -0.6520, -2.9506]
+                )
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+            elif step == 50:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [1.1078, 1.5803, 0.2773, -0.0589, -1.7928, -0.3665, -0.4695, -1.0727, -1.1601]
+                )
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2
+
+        test_callback_fn.has_been_called = False
+
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
+        )
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Andromeda galaxy in a bottle"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            pipe(
+                prompt=prompt,
+                num_inference_steps=50,
+                guidance_scale=7.5,
+                generator=generator,
+                callback=test_callback_fn,
+                callback_steps=1,
+            )
+        assert test_callback_fn.has_been_called
+        assert number_of_steps == 51
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
new file mode 100644
index 000000000000..473014fa2ea7
--- /dev/null
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -0,0 +1,601 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionImg2ImgPipeline,
+    UNet2DConditionModel,
+    UNet2DModel,
+    VQModel,
+)
+from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class PipelineFastTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_cond_unet_inpaint(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vq_model(self):
+        torch.manual_seed(0)
+        model = VQModel(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=3,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_stable_diffusion_img2img(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        init_image = self.dummy_image.to(device)
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionImg2ImgPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+        )
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4492, 0.3865, 0.4222, 0.5854, 0.5139, 0.4379, 0.4193, 0.48, 0.4218])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_img2img_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        init_image = self.dummy_image.to(device)
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionImg2ImgPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        negative_prompt = "french fries"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            prompt,
+            negative_prompt=negative_prompt,
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+        )
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4065, 0.3783, 0.4050, 0.5266, 0.4781, 0.4252, 0.4203, 0.4692, 0.4365])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_img2img_multiple_init_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        init_image = self.dummy_image.to(device).repeat(2, 1, 1, 1)
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionImg2ImgPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = 2 * ["A painting of a squirrel eating a burger"]
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            prompt,
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+        )
+
+        image = output.images
+
+        image_slice = image[-1, -3:, -3:, -1]
+
+        assert image.shape == (2, 32, 32, 3)
+        expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_img2img_k_lms(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        init_image = self.dummy_image.to(device)
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionImg2ImgPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+        )
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            return_dict=False,
+        )
+        image_from_tuple = output[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4367, 0.4986, 0.4372, 0.6706, 0.5665, 0.444, 0.5864, 0.6019, 0.5203])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_img2img_num_images_per_prompt(self):
+        device = "cpu"
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        init_image = self.dummy_image.to(device)
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionImg2ImgPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        # test num_images_per_prompt=1 (default)
+        images = sd_pipe(
+            prompt,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+        ).images
+
+        assert images.shape == (1, 32, 32, 3)
+
+        # test num_images_per_prompt=1 (default) for batch of prompts
+        batch_size = 2
+        images = sd_pipe(
+            [prompt] * batch_size,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+        ).images
+
+        assert images.shape == (batch_size, 32, 32, 3)
+
+        # test num_images_per_prompt for single prompt
+        num_images_per_prompt = 2
+        images = sd_pipe(
+            prompt,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            num_images_per_prompt=num_images_per_prompt,
+        ).images
+
+        assert images.shape == (num_images_per_prompt, 32, 32, 3)
+
+        # test num_images_per_prompt for batch of prompts
+        batch_size = 2
+        images = sd_pipe(
+            [prompt] * batch_size,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            num_images_per_prompt=num_images_per_prompt,
+        ).images
+
+        assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_img2img_fp16(self):
+        """Test that stable diffusion img2img works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        init_image = self.dummy_image.to(torch_device)
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionImg2ImgPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+        ).images
+
+        assert image.shape == (1, 32, 32, 3)
+
+
+@slow
+@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
+class PipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_img2img_pipeline(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/fantasy_landscape.png"
+        )
+        init_image = init_image.resize((768, 512))
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        model_id = "CompVis/stable-diffusion-v1-4"
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+            model_id,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 768, 3)
+        # img2img is flaky across GPUs even in fp32, so using MAE here
+        assert np.abs(expected_image - image).mean() < 1e-2
+
+    def test_stable_diffusion_img2img_pipeline_k_lms(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/fantasy_landscape_k_lms.png"
+        )
+        init_image = init_image.resize((768, 512))
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+
+        model_id = "CompVis/stable-diffusion-v1-4"
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+            model_id,
+            scheduler=lms,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 768, 3)
+        # img2img is flaky across GPUs even in fp32, so using MAE here
+        assert np.abs(expected_image - image).mean() < 1e-2
+
+    def test_stable_diffusion_img2img_intermediate_state(self):
+        number_of_steps = 0
+
+        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
+            test_callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 0:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 96)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([0.9052, -0.0184, 0.4810, 0.2898, 0.5851, 1.4920, 0.5362, 1.9838, 0.0530])
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+            elif step == 37:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 96)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([0.7071, 0.7831, 0.8300, 1.8140, 1.7840, 1.9402, 1.3651, 1.6590, 1.2828])
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2
+
+        test_callback_fn.has_been_called = False
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((768, 512))
+
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            pipe(
+                prompt=prompt,
+                init_image=init_image,
+                strength=0.75,
+                num_inference_steps=50,
+                guidance_scale=7.5,
+                generator=generator,
+                callback=test_callback_fn,
+                callback_steps=1,
+            )
+        assert test_callback_fn.has_been_called
+        assert number_of_steps == 38
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
new file mode 100644
index 000000000000..5244bda1cbb0
--- /dev/null
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -0,0 +1,384 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    PNDMScheduler,
+    StableDiffusionInpaintPipeline,
+    UNet2DConditionModel,
+    UNet2DModel,
+    VQModel,
+)
+from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class PipelineFastTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_cond_unet_inpaint(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vq_model(self):
+        torch.manual_seed(0)
+        model = VQModel(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=3,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_stable_diffusion_inpaint(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet_inpaint
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionInpaintPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+            mask_image=mask_image,
+        )
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+            mask_image=mask_image,
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.5075, 0.4485, 0.4558, 0.5369, 0.5369, 0.5236, 0.5127, 0.4983, 0.4776])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_inpaint_fp16(self):
+        """Test that stable diffusion inpaint_legacy works with fp16"""
+        unet = self.dummy_cond_unet_inpaint
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionInpaintPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+            mask_image=mask_image,
+        ).images
+
+        assert image.shape == (1, 128, 128, 3)
+
+
+@slow
+@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
+class PipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_inpaint_pipeline(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/yellow_cat_sitting_on_a_park_bench.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        model_id = "runwayml/stable-diffusion-inpainting"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_pipeline_fp16(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/yellow_cat_sitting_on_a_park_bench_fp16.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        model_id = "runwayml/stable-diffusion-inpainting"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id,
+            revision="fp16",
+            torch_dtype=torch.float16,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_pipeline_pndm(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/yellow_cat_sitting_on_a_park_bench_pndm.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        pndm = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True)
+        model_id = "runwayml/stable-diffusion-inpainting"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None, scheduler=pndm)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
new file mode 100644
index 000000000000..3259ed49f4f2
--- /dev/null
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -0,0 +1,491 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionInpaintPipeline,
+    StableDiffusionInpaintPipelineLegacy,
+    UNet2DConditionModel,
+    UNet2DModel,
+    VQModel,
+)
+from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class PipelineFastTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_cond_unet_inpaint(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vq_model(self):
+        torch.manual_seed(0)
+        model = VQModel(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=3,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_stable_diffusion_inpaint_legacy(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionInpaintPipelineLegacy(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            mask_image=mask_image,
+        )
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            mask_image=mask_image,
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4731, 0.5346, 0.4531, 0.6251, 0.5446, 0.4057, 0.5527, 0.5896, 0.5153])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_legacy_negative_prompt(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionInpaintPipelineLegacy(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        negative_prompt = "french fries"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            prompt,
+            negative_prompt=negative_prompt,
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            mask_image=mask_image,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4765, 0.5339, 0.4541, 0.6240, 0.5439, 0.4055, 0.5503, 0.5891, 0.5150])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
+        device = "cpu"
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionInpaintPipelineLegacy(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        # test num_images_per_prompt=1 (default)
+        images = sd_pipe(
+            prompt,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            mask_image=mask_image,
+        ).images
+
+        assert images.shape == (1, 32, 32, 3)
+
+        # test num_images_per_prompt=1 (default) for batch of prompts
+        batch_size = 2
+        images = sd_pipe(
+            [prompt] * batch_size,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            mask_image=mask_image,
+        ).images
+
+        assert images.shape == (batch_size, 32, 32, 3)
+
+        # test num_images_per_prompt for single prompt
+        num_images_per_prompt = 2
+        images = sd_pipe(
+            prompt,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            mask_image=mask_image,
+            num_images_per_prompt=num_images_per_prompt,
+        ).images
+
+        assert images.shape == (num_images_per_prompt, 32, 32, 3)
+
+        # test num_images_per_prompt for batch of prompts
+        batch_size = 2
+        images = sd_pipe(
+            [prompt] * batch_size,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            mask_image=mask_image,
+            num_images_per_prompt=num_images_per_prompt,
+        ).images
+
+        assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)
+
+
+@slow
+@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
+class PipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_inpaint_legacy_pipeline(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/red_cat_sitting_on_a_park_bench.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        model_id = "CompVis/stable-diffusion-v1-4"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "A red cat sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            mask_image=mask_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
+        # TODO(Anton, Patrick) - I think we can remove this test soon
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/red_cat_sitting_on_a_park_bench_k_lms.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+
+        model_id = "CompVis/stable-diffusion-v1-4"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id,
+            scheduler=lms,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "A red cat sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            mask_image=mask_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
+
+    def test_stable_diffusion_inpaint_legacy_intermediate_state(self):
+        number_of_steps = 0
+
+        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
+            test_callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 0:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [-0.5472, 1.1218, -0.5505, -0.9390, -1.0794, 0.4063, 0.5158, 0.6429, -1.5246]
+                )
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+            elif step == 37:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([0.4781, 1.1572, 0.6258, 0.2291, 0.2554, -0.1443, 0.7085, -0.1598, -0.5659])
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+
+        test_callback_fn.has_been_called = False
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "A red cat sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            pipe(
+                prompt=prompt,
+                init_image=init_image,
+                mask_image=mask_image,
+                strength=0.75,
+                num_inference_steps=50,
+                guidance_scale=7.5,
+                generator=generator,
+                callback=test_callback_fn,
+                callback_steps=1,
+            )
+        assert test_callback_fn.has_been_called
+        assert number_of_steps == 38
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index bbda797bf694..9d1c41eee93c 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -36,7 +36,6 @@
     KarrasVeScheduler,
     LDMPipeline,
     LDMTextToImagePipeline,
-    LMSDiscreteScheduler,
     OnnxStableDiffusionImg2ImgPipeline,
     OnnxStableDiffusionInpaintPipeline,
     OnnxStableDiffusionPipeline,
@@ -45,7 +44,6 @@
     ScoreSdeVePipeline,
     ScoreSdeVeScheduler,
     StableDiffusionImg2ImgPipeline,
-    StableDiffusionInpaintPipeline,
     StableDiffusionInpaintPipelineLegacy,
     StableDiffusionPipeline,
     UNet2DConditionModel,
@@ -248,13 +246,6 @@ def dummy_text_encoder(self):
         )
         return CLIPTextModel(config)
 
-    @property
-    def dummy_safety_checker(self):
-        def check(images, *args, **kwargs):
-            return images, [False] * len(images)
-
-        return check
-
     @property
     def dummy_extractor(self):
         def extract(*args, **kwargs):
@@ -364,287 +355,6 @@ def test_ldm_text2img(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
-    def test_stable_diffusion_ddim(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
-        image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 128, 128, 3)
-        expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_ddim_factor_8(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            height=536,
-            width=536,
-            num_inference_steps=2,
-            output_type="np",
-        )
-        image = output.images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 134, 134, 3)
-        expected_slice = np.array([0.7834, 0.5488, 0.5781, 0.46, 0.3609, 0.5369, 0.542, 0.4855, 0.5557])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_pndm(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
-
-        image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 128, 128, 3)
-        expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_from_pretrained_error_message_uninstalled_packages(self):
-        # TODO(Patrick, Pedro) - need better test here for the future
-        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-lms-pipe")
-        assert isinstance(pipe, StableDiffusionPipeline)
-        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
-
-    def test_stable_diffusion_no_safety_checker(self):
-        pipe = StableDiffusionPipeline.from_pretrained(
-            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
-        )
-        assert isinstance(pipe, StableDiffusionPipeline)
-        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
-        assert pipe.safety_checker is None
-
-        image = pipe("example prompt", num_inference_steps=2).images[0]
-        assert image is not None
-
-    def test_stable_diffusion_k_lms(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
-
-        image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 128, 128, 3)
-        expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_attention_chunk(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
-
-        # make sure chunking the attention yields the same result
-        sd_pipe.enable_attention_slicing(slice_size=1)
-        generator = torch.Generator(device=device).manual_seed(0)
-        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
-
-        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 1e-4
-
-    def test_stable_diffusion_negative_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        negative_prompt = "french fries"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe(
-            prompt,
-            negative_prompt=negative_prompt,
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-        )
-
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 128, 128, 3)
-        expected_slice = np.array([0.4851, 0.4617, 0.4765, 0.5127, 0.4845, 0.5153, 0.5141, 0.4886, 0.4719])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
     def test_score_sde_ve_pipeline(self):
         unet = self.dummy_uncond_unet
         scheduler = ScoreSdeVeScheduler()
@@ -719,731 +429,58 @@ def test_karras_ve_pipeline(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
-    def test_stable_diffusion_img2img(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+    def test_components(self):
+        """Test that components property works correctly"""
         unet = self.dummy_cond_unet
         scheduler = PNDMScheduler(skip_prk_steps=True)
         vae = self.dummy_vae
         bert = self.dummy_text_encoder
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
-        init_image = self.dummy_image.to(device)
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
 
         # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionImg2ImgPipeline(
+        inpaint = StableDiffusionInpaintPipelineLegacy(
             unet=unet,
             scheduler=scheduler,
             vae=vae,
             text_encoder=bert,
             tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
+            safety_checker=None,
             feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
+        ).to(torch_device)
+        img2img = StableDiffusionImg2ImgPipeline(**inpaint.components).to(torch_device)
+        text2img = StableDiffusionPipeline(**inpaint.components).to(torch_device)
 
         prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe(
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image_inpaint = inpaint(
             [prompt],
             generator=generator,
-            guidance_scale=6.0,
             num_inference_steps=2,
             output_type="np",
             init_image=init_image,
-        )
-
-        image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = sd_pipe(
+            mask_image=mask_image,
+        ).images
+        image_img2img = img2img(
             [prompt],
             generator=generator,
-            guidance_scale=6.0,
             num_inference_steps=2,
             output_type="np",
             init_image=init_image,
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.4492, 0.3865, 0.4222, 0.5854, 0.5139, 0.4379, 0.4193, 0.48, 0.4218])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_img2img_negative_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        init_image = self.dummy_image.to(device)
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionImg2ImgPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        negative_prompt = "french fries"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe(
-            prompt,
-            negative_prompt=negative_prompt,
+        ).images
+        image_text2img = text2img(
+            [prompt],
             generator=generator,
-            guidance_scale=6.0,
             num_inference_steps=2,
             output_type="np",
-            init_image=init_image,
-        )
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.4065, 0.3783, 0.4050, 0.5266, 0.4781, 0.4252, 0.4203, 0.4692, 0.4365])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        ).images
 
-    def test_stable_diffusion_img2img_multiple_init_images(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        init_image = self.dummy_image.to(device).repeat(2, 1, 1, 1)
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionImg2ImgPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = 2 * ["A painting of a squirrel eating a burger"]
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe(
-            prompt,
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-        )
-
-        image = output.images
-
-        image_slice = image[-1, -3:, -3:, -1]
-
-        assert image.shape == (2, 32, 32, 3)
-        expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_img2img_k_lms(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
-
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        init_image = self.dummy_image.to(device)
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionImg2ImgPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-        )
-        image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            return_dict=False,
-        )
-        image_from_tuple = output[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.4367, 0.4986, 0.4372, 0.6706, 0.5665, 0.444, 0.5864, 0.6019, 0.5203])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_inpaint_legacy(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionInpaintPipelineLegacy(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            mask_image=mask_image,
-        )
-
-        image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            mask_image=mask_image,
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.4731, 0.5346, 0.4531, 0.6251, 0.5446, 0.4057, 0.5527, 0.5896, 0.5153])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_inpaint(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet_inpaint
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionInpaintPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=None,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            image=init_image,
-            mask_image=mask_image,
-        )
-
-        image = output.images
-
-        generator = torch.Generator(device=device).manual_seed(0)
-        image_from_tuple = sd_pipe(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            image=init_image,
-            mask_image=mask_image,
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 128, 128, 3)
-        expected_slice = np.array([0.5075, 0.4485, 0.4558, 0.5369, 0.5369, 0.5236, 0.5127, 0.4983, 0.4776])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_inpaint_legacy_negative_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionInpaintPipelineLegacy(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        negative_prompt = "french fries"
-        generator = torch.Generator(device=device).manual_seed(0)
-        output = sd_pipe(
-            prompt,
-            negative_prompt=negative_prompt,
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            mask_image=mask_image,
-        )
-
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.4765, 0.5339, 0.4541, 0.6240, 0.5439, 0.4055, 0.5503, 0.5891, 0.5150])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_stable_diffusion_num_images_per_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-
-        # test num_images_per_prompt=1 (default)
-        images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images
-
-        assert images.shape == (1, 128, 128, 3)
-
-        # test num_images_per_prompt=1 (default) for batch of prompts
-        batch_size = 2
-        images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images
-
-        assert images.shape == (batch_size, 128, 128, 3)
-
-        # test num_images_per_prompt for single prompt
-        num_images_per_prompt = 2
-        images = sd_pipe(
-            prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
-        ).images
-
-        assert images.shape == (num_images_per_prompt, 128, 128, 3)
-
-        # test num_images_per_prompt for batch of prompts
-        batch_size = 2
-        images = sd_pipe(
-            [prompt] * batch_size, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
-        ).images
-
-        assert images.shape == (batch_size * num_images_per_prompt, 128, 128, 3)
-
-    def test_stable_diffusion_img2img_num_images_per_prompt(self):
-        device = "cpu"
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        init_image = self.dummy_image.to(device)
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionImg2ImgPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-
-        # test num_images_per_prompt=1 (default)
-        images = sd_pipe(
-            prompt,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-        ).images
-
-        assert images.shape == (1, 32, 32, 3)
-
-        # test num_images_per_prompt=1 (default) for batch of prompts
-        batch_size = 2
-        images = sd_pipe(
-            [prompt] * batch_size,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-        ).images
-
-        assert images.shape == (batch_size, 32, 32, 3)
-
-        # test num_images_per_prompt for single prompt
-        num_images_per_prompt = 2
-        images = sd_pipe(
-            prompt,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            num_images_per_prompt=num_images_per_prompt,
-        ).images
-
-        assert images.shape == (num_images_per_prompt, 32, 32, 3)
-
-        # test num_images_per_prompt for batch of prompts
-        batch_size = 2
-        images = sd_pipe(
-            [prompt] * batch_size,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            num_images_per_prompt=num_images_per_prompt,
-        ).images
-
-        assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)
-
-    def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
-        device = "cpu"
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionInpaintPipelineLegacy(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-
-        # test num_images_per_prompt=1 (default)
-        images = sd_pipe(
-            prompt,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            mask_image=mask_image,
-        ).images
-
-        assert images.shape == (1, 32, 32, 3)
-
-        # test num_images_per_prompt=1 (default) for batch of prompts
-        batch_size = 2
-        images = sd_pipe(
-            [prompt] * batch_size,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            mask_image=mask_image,
-        ).images
-
-        assert images.shape == (batch_size, 32, 32, 3)
-
-        # test num_images_per_prompt for single prompt
-        num_images_per_prompt = 2
-        images = sd_pipe(
-            prompt,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            mask_image=mask_image,
-            num_images_per_prompt=num_images_per_prompt,
-        ).images
-
-        assert images.shape == (num_images_per_prompt, 32, 32, 3)
-
-        # test num_images_per_prompt for batch of prompts
-        batch_size = 2
-        images = sd_pipe(
-            [prompt] * batch_size,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            mask_image=mask_image,
-            num_images_per_prompt=num_images_per_prompt,
-        ).images
-
-        assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)
-
-    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
-    def test_stable_diffusion_fp16(self):
-        """Test that stable diffusion works with fp16"""
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        # put models in fp16
-        unet = unet.half()
-        vae = vae.half()
-        bert = bert.half()
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
-
-        assert image.shape == (1, 128, 128, 3)
-
-    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
-    def test_stable_diffusion_img2img_fp16(self):
-        """Test that stable diffusion img2img works with fp16"""
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        init_image = self.dummy_image.to(torch_device)
-
-        # put models in fp16
-        unet = unet.half()
-        vae = vae.half()
-        bert = bert.half()
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionImg2ImgPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        image = sd_pipe(
-            [prompt],
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-        ).images
-
-        assert image.shape == (1, 32, 32, 3)
-
-    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
-    def test_stable_diffusion_inpaint_fp16(self):
-        """Test that stable diffusion inpaint_legacy works with fp16"""
-        unet = self.dummy_cond_unet_inpaint
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
-
-        # put models in fp16
-        unet = unet.half()
-        vae = vae.half()
-        bert = bert.half()
-
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionInpaintPipeline(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=None,
-        )
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        image = sd_pipe(
-            [prompt],
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-            image=init_image,
-            mask_image=mask_image,
-        ).images
-
-        assert image.shape == (1, 128, 128, 3)
-
-    def test_components(self):
-        """Test that components property works correctly"""
-        unet = self.dummy_cond_unet
-        scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
-
-        # make sure here that pndm scheduler skips prk
-        inpaint = StableDiffusionInpaintPipelineLegacy(
-            unet=unet,
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=self.dummy_safety_checker,
-            feature_extractor=self.dummy_extractor,
-        ).to(torch_device)
-        img2img = StableDiffusionImg2ImgPipeline(**inpaint.components).to(torch_device)
-        text2img = StableDiffusionPipeline(**inpaint.components).to(torch_device)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        image_inpaint = inpaint(
-            [prompt],
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-            mask_image=mask_image,
-        ).images
-        image_img2img = img2img(
-            [prompt],
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-            init_image=init_image,
-        ).images
-        image_text2img = text2img(
-            [prompt],
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-
-        assert image_inpaint.shape == (1, 32, 32, 3)
-        assert image_img2img.shape == (1, 32, 32, 3)
-        assert image_text2img.shape == (1, 128, 128, 3)
+        assert image_inpaint.shape == (1, 32, 32, 3)
+        assert image_img2img.shape == (1, 32, 32, 3)
+        assert image_text2img.shape == (1, 128, 128, 3)
 
 
 class PipelineTesterMixin(unittest.TestCase):
@@ -1483,13 +520,6 @@ def test_warning_unused_kwargs(self):
 
         assert cap_logger.out == "Keyword arguments {'not_used': True} not recognized.\n"
 
-    @property
-    def dummy_safety_checker(self):
-        def check(images, *args, **kwargs):
-            return images, [False] * len(images)
-
-        return check
-
     def test_from_pretrained_save_pretrained(self):
         # 1. Load models
         model = UNet2DModel(
@@ -1701,58 +731,6 @@ def test_ldm_text2img_fast(self):
         expected_slice = np.array([0.3163, 0.8670, 0.6465, 0.1865, 0.6291, 0.5139, 0.2824, 0.3723, 0.4344])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion(self):
-        # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        with torch.autocast("cuda"):
-            output = sd_pipe(
-                [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np"
-            )
-
-        image = output.images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.8887, 0.915, 0.91, 0.894, 0.909, 0.912, 0.919, 0.925, 0.883])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_fast_ddim(self):
-        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-        sd_pipe.scheduler = scheduler
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-
-        with torch.autocast("cuda"):
-            output = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy")
-        image = output.images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.9326, 0.923, 0.951, 0.9365, 0.9214, 0.951, 0.9365, 0.9414, 0.918])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
     @slow
     def test_score_sde_ve_pipeline(self):
         model_id = "google/ncsnpp-church-256"
@@ -1780,499 +758,83 @@ def test_ldm_uncond(self):
         ldm.to(torch_device)
         ldm.set_progress_bar_config(disable=None)
 
-        generator = torch.manual_seed(0)
-        image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    def test_ddpm_ddim_equality(self):
-        model_id = "google/ddpm-cifar10-32"
-
-        unet = UNet2DModel.from_pretrained(model_id)
-        ddpm_scheduler = DDPMScheduler()
-        ddim_scheduler = DDIMScheduler()
-
-        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
-        ddpm.to(torch_device)
-        ddpm.set_progress_bar_config(disable=None)
-        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
-        ddim.to(torch_device)
-        ddim.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        ddpm_image = ddpm(generator=generator, output_type="numpy").images
-
-        generator = torch.manual_seed(0)
-        ddim_image = ddim(generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy").images
-
-        # the values aren't exactly equal, but the images look the same visually
-        assert np.abs(ddpm_image - ddim_image).max() < 1e-1
-
-    @unittest.skip("(Anton) The test is failing for large batch sizes, needs investigation")
-    def test_ddpm_ddim_equality_batched(self):
-        model_id = "google/ddpm-cifar10-32"
-
-        unet = UNet2DModel.from_pretrained(model_id)
-        ddpm_scheduler = DDPMScheduler()
-        ddim_scheduler = DDIMScheduler()
-
-        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
-        ddpm.to(torch_device)
-        ddpm.set_progress_bar_config(disable=None)
-
-        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
-        ddim.to(torch_device)
-        ddim.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        ddpm_images = ddpm(batch_size=4, generator=generator, output_type="numpy").images
-
-        generator = torch.manual_seed(0)
-        ddim_images = ddim(
-            batch_size=4, generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy"
-        ).images
-
-        # the values aren't exactly equal, but the images look the same visually
-        assert np.abs(ddpm_images - ddim_images).max() < 1e-1
-
-    @slow
-    def test_karras_ve_pipeline(self):
-        model_id = "google/ncsnpp-celebahq-256"
-        model = UNet2DModel.from_pretrained(model_id)
-        scheduler = KarrasVeScheduler()
-
-        pipe = KarrasVePipeline(unet=model, scheduler=scheduler)
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images
-
-        image_slice = image[0, -3:, -3:, -1]
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.578, 0.5811, 0.5924, 0.5809, 0.587, 0.5886, 0.5861, 0.5802, 0.586])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_lms_stable_diffusion_pipeline(self):
-        model_id = "CompVis/stable-diffusion-v1-1"
-        pipe = StableDiffusionPipeline.from_pretrained(model_id).to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        scheduler = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
-        pipe.scheduler = scheduler
-
-        prompt = "a photograph of an astronaut riding a horse"
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        image = pipe(
-            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
-        ).images
-
-        image_slice = image[0, -3:, -3:, -1]
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.9077, 0.9254, 0.9181, 0.9227, 0.9213, 0.9367, 0.9399, 0.9406, 0.9024])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_memory_chunking(self):
-        torch.cuda.reset_peak_memory_stats()
-        model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16).to(
-            torch_device
-        )
-        pipe.set_progress_bar_config(disable=None)
-
-        prompt = "a photograph of an astronaut riding a horse"
-
-        # make attention efficient
-        pipe.enable_attention_slicing()
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        with torch.autocast(torch_device):
-            output_chunked = pipe(
-                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
-            )
-            image_chunked = output_chunked.images
-
-        mem_bytes = torch.cuda.max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
-        # make sure that less than 3.75 GB is allocated
-        assert mem_bytes < 3.75 * 10**9
-
-        # disable chunking
-        pipe.disable_attention_slicing()
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        with torch.autocast(torch_device):
-            output = pipe(
-                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
-            )
-            image = output.images
-
-        # make sure that more than 3.75 GB is allocated
-        mem_bytes = torch.cuda.max_memory_allocated()
-        assert mem_bytes > 3.75 * 10**9
-        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-3
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_text2img_pipeline_fp16(self):
-        torch.cuda.reset_peak_memory_stats()
-        model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16).to(
-            torch_device
-        )
-        pipe.set_progress_bar_config(disable=None)
-
-        prompt = "a photograph of an astronaut riding a horse"
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        output_chunked = pipe(
-            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
-        )
-        image_chunked = output_chunked.images
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        with torch.autocast(torch_device):
-            output = pipe(
-                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
-            )
-            image = output.images
-
-        # Make sure results are close enough
-        diff = np.abs(image_chunked.flatten() - image.flatten())
-        # They ARE different since ops are not run always at the same precision
-        # however, they should be extremely close.
-        assert diff.mean() < 2e-2
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_text2img_pipeline(self):
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/text2img/astronaut_riding_a_horse.png"
-        )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
-
-        model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionPipeline.from_pretrained(
-            model_id,
-            safety_checker=self.dummy_safety_checker,
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        prompt = "astronaut riding a horse"
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = pipe(prompt=prompt, strength=0.75, guidance_scale=7.5, generator=generator, output_type="np")
-        image = output.images[0]
-
-        assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_img2img_pipeline(self):
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/img2img/sketch-mountains-input.jpg"
-        )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/img2img/fantasy_landscape.png"
-        )
-        init_image = init_image.resize((768, 512))
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
-
-        model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-            model_id,
-            safety_checker=self.dummy_safety_checker,
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        prompt = "A fantasy landscape, trending on artstation"
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = pipe(
-            prompt=prompt,
-            init_image=init_image,
-            strength=0.75,
-            guidance_scale=7.5,
-            generator=generator,
-            output_type="np",
-        )
-        image = output.images[0]
-
-        assert image.shape == (512, 768, 3)
-        # img2img is flaky across GPUs even in fp32, so using MAE here
-        assert np.abs(expected_image - image).mean() < 1e-2
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_img2img_pipeline_k_lms(self):
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/img2img/sketch-mountains-input.jpg"
-        )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/img2img/fantasy_landscape_k_lms.png"
-        )
-        init_image = init_image.resize((768, 512))
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
-
-        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
-
-        model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-            model_id,
-            scheduler=lms,
-            safety_checker=self.dummy_safety_checker,
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        prompt = "A fantasy landscape, trending on artstation"
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = pipe(
-            prompt=prompt,
-            init_image=init_image,
-            strength=0.75,
-            guidance_scale=7.5,
-            generator=generator,
-            output_type="np",
-        )
-        image = output.images[0]
-
-        assert image.shape == (512, 768, 3)
-        # img2img is flaky across GPUs even in fp32, so using MAE here
-        assert np.abs(expected_image - image).mean() < 1e-2
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_inpaint_pipeline(self):
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo.png"
-        )
-        mask_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
-        )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/yellow_cat_sitting_on_a_park_bench.png"
-        )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
-
-        model_id = "runwayml/stable-diffusion-inpainting"
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            model_id,
-            safety_checker=self.dummy_safety_checker,
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = pipe(
-            prompt=prompt,
-            image=init_image,
-            mask_image=mask_image,
-            generator=generator,
-            output_type="np",
-        )
-        image = output.images[0]
-
-        assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        generator = torch.manual_seed(0)
+        image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images
 
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_inpaint_pipeline_fp16(self):
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo.png"
-        )
-        mask_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
-        )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/yellow_cat_sitting_on_a_park_bench_fp16.png"
-        )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+        image_slice = image[0, -3:, -3:, -1]
 
-        model_id = "runwayml/stable-diffusion-inpainting"
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            model_id,
-            revision="fp16",
-            torch_dtype=torch.float16,
-            safety_checker=None,
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
-        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+    @slow
+    def test_ddpm_ddim_equality(self):
+        model_id = "google/ddpm-cifar10-32"
 
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = pipe(
-            prompt=prompt,
-            image=init_image,
-            mask_image=mask_image,
-            generator=generator,
-            output_type="np",
-        )
-        image = output.images[0]
+        unet = UNet2DModel.from_pretrained(model_id)
+        ddpm_scheduler = DDPMScheduler()
+        ddim_scheduler = DDIMScheduler()
 
-        assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
+        ddim.to(torch_device)
+        ddim.set_progress_bar_config(disable=None)
 
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_inpaint_legacy_pipeline(self):
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo.png"
-        )
-        mask_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
-        )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/red_cat_sitting_on_a_park_bench.png"
-        )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+        generator = torch.manual_seed(0)
+        ddpm_image = ddpm(generator=generator, output_type="numpy").images
 
-        model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            model_id,
-            safety_checker=self.dummy_safety_checker,
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
+        generator = torch.manual_seed(0)
+        ddim_image = ddim(generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy").images
 
-        prompt = "A red cat sitting on a park bench"
+        # the values aren't exactly equal, but the images look the same visually
+        assert np.abs(ddpm_image - ddim_image).max() < 1e-1
 
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = pipe(
-            prompt=prompt,
-            init_image=init_image,
-            mask_image=mask_image,
-            strength=0.75,
-            guidance_scale=7.5,
-            generator=generator,
-            output_type="np",
-        )
-        image = output.images[0]
+    @unittest.skip("(Anton) The test is failing for large batch sizes, needs investigation")
+    def test_ddpm_ddim_equality_batched(self):
+        model_id = "google/ddpm-cifar10-32"
 
-        assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        unet = UNet2DModel.from_pretrained(model_id)
+        ddpm_scheduler = DDPMScheduler()
+        ddim_scheduler = DDIMScheduler()
 
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_inpaint_pipeline_pndm(self):
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo.png"
-        )
-        mask_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
-        )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/yellow_cat_sitting_on_a_park_bench_pndm.png"
-        )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
 
-        pndm = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True)
-        model_id = "runwayml/stable-diffusion-inpainting"
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            model_id, safety_checker=self.dummy_safety_checker, scheduler=pndm
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
+        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
+        ddim.to(torch_device)
+        ddim.set_progress_bar_config(disable=None)
 
-        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+        generator = torch.manual_seed(0)
+        ddpm_images = ddpm(batch_size=4, generator=generator, output_type="numpy").images
 
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = pipe(
-            prompt=prompt,
-            image=init_image,
-            mask_image=mask_image,
-            generator=generator,
-            output_type="np",
-        )
-        image = output.images[0]
+        generator = torch.manual_seed(0)
+        ddim_images = ddim(
+            batch_size=4, generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy"
+        ).images
 
-        assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        # the values aren't exactly equal, but the images look the same visually
+        assert np.abs(ddpm_images - ddim_images).max() < 1e-1
 
     @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
-        # TODO(Anton, Patrick) - I think we can remove this test soon
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo.png"
-        )
-        mask_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
-        )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/red_cat_sitting_on_a_park_bench_k_lms.png"
-        )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
-
-        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+    def test_karras_ve_pipeline(self):
+        model_id = "google/ncsnpp-celebahq-256"
+        model = UNet2DModel.from_pretrained(model_id)
+        scheduler = KarrasVeScheduler()
 
-        model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            model_id,
-            scheduler=lms,
-            safety_checker=self.dummy_safety_checker,
-        )
+        pipe = KarrasVePipeline(unet=model, scheduler=scheduler)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        prompt = "A red cat sitting on a park bench"
 
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = pipe(
-            prompt=prompt,
-            init_image=init_image,
-            mask_image=mask_image,
-            strength=0.75,
-            guidance_scale=7.5,
-            generator=generator,
-            output_type="np",
-        )
-        image = output.images[0]
+        generator = torch.manual_seed(0)
+        image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images
 
-        assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        image_slice = image[0, -3:, -3:, -1]
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.578, 0.5811, 0.5924, 0.5809, 0.587, 0.5886, 0.5861, 0.5802, 0.586])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     @slow
     def test_stable_diffusion_onnx(self):
@@ -2356,170 +918,6 @@ def test_stable_diffusion_inpaint_onnx(self):
         expected_slice = np.array([0.2951, 0.2955, 0.2922, 0.2036, 0.1977, 0.2279, 0.1716, 0.1641, 0.1799])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_text2img_intermediate_state(self):
-        number_of_steps = 0
-
-        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
-            test_callback_fn.has_been_called = True
-            nonlocal number_of_steps
-            number_of_steps += 1
-            if step == 0:
-                latents = latents.detach().cpu().numpy()
-                assert latents.shape == (1, 4, 64, 64)
-                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array(
-                    [1.8285, 1.2857, -0.1024, 1.2406, -2.3068, 1.0747, -0.0818, -0.6520, -2.9506]
-                )
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
-            elif step == 50:
-                latents = latents.detach().cpu().numpy()
-                assert latents.shape == (1, 4, 64, 64)
-                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array(
-                    [1.1078, 1.5803, 0.2773, -0.0589, -1.7928, -0.3665, -0.4695, -1.0727, -1.1601]
-                )
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2
-
-        test_callback_fn.has_been_called = False
-
-        pipe = StableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
-        )
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        prompt = "Andromeda galaxy in a bottle"
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        with torch.autocast(torch_device):
-            pipe(
-                prompt=prompt,
-                num_inference_steps=50,
-                guidance_scale=7.5,
-                generator=generator,
-                callback=test_callback_fn,
-                callback_steps=1,
-            )
-        assert test_callback_fn.has_been_called
-        assert number_of_steps == 51
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_img2img_intermediate_state(self):
-        number_of_steps = 0
-
-        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
-            test_callback_fn.has_been_called = True
-            nonlocal number_of_steps
-            number_of_steps += 1
-            if step == 0:
-                latents = latents.detach().cpu().numpy()
-                assert latents.shape == (1, 4, 64, 96)
-                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array([0.9052, -0.0184, 0.4810, 0.2898, 0.5851, 1.4920, 0.5362, 1.9838, 0.0530])
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
-            elif step == 37:
-                latents = latents.detach().cpu().numpy()
-                assert latents.shape == (1, 4, 64, 96)
-                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array([0.7071, 0.7831, 0.8300, 1.8140, 1.7840, 1.9402, 1.3651, 1.6590, 1.2828])
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2
-
-        test_callback_fn.has_been_called = False
-
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/img2img/sketch-mountains-input.jpg"
-        )
-        init_image = init_image.resize((768, 512))
-
-        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        prompt = "A fantasy landscape, trending on artstation"
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        with torch.autocast(torch_device):
-            pipe(
-                prompt=prompt,
-                init_image=init_image,
-                strength=0.75,
-                num_inference_steps=50,
-                guidance_scale=7.5,
-                generator=generator,
-                callback=test_callback_fn,
-                callback_steps=1,
-            )
-        assert test_callback_fn.has_been_called
-        assert number_of_steps == 38
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-    def test_stable_diffusion_inpaint_legacy_intermediate_state(self):
-        number_of_steps = 0
-
-        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
-            test_callback_fn.has_been_called = True
-            nonlocal number_of_steps
-            number_of_steps += 1
-            if step == 0:
-                latents = latents.detach().cpu().numpy()
-                assert latents.shape == (1, 4, 64, 64)
-                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array(
-                    [-0.5472, 1.1218, -0.5505, -0.9390, -1.0794, 0.4063, 0.5158, 0.6429, -1.5246]
-                )
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
-            elif step == 37:
-                latents = latents.detach().cpu().numpy()
-                assert latents.shape == (1, 4, 64, 64)
-                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array([0.4781, 1.1572, 0.6258, 0.2291, 0.2554, -0.1443, 0.7085, -0.1598, -0.5659])
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
-
-        test_callback_fn.has_been_called = False
-
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo.png"
-        )
-        mask_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
-        )
-
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        prompt = "A red cat sitting on a park bench"
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        with torch.autocast(torch_device):
-            pipe(
-                prompt=prompt,
-                init_image=init_image,
-                mask_image=mask_image,
-                strength=0.75,
-                num_inference_steps=50,
-                guidance_scale=7.5,
-                generator=generator,
-                callback=test_callback_fn,
-                callback_steps=1,
-            )
-        assert test_callback_fn.has_been_called
-        assert number_of_steps == 38
-
     @slow
     def test_stable_diffusion_onnx_intermediate_state(self):
         number_of_steps = 0

From dec18c86321de39ad853e187826a70c435467c16 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Fri, 21 Oct 2022 13:13:36 +0200
Subject: [PATCH 028/269] [Flax] dont warn for bf16 weights (#923)

dont warn for bf16 weights
---
 src/diffusers/modeling_flax_utils.py | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/src/diffusers/modeling_flax_utils.py b/src/diffusers/modeling_flax_utils.py
index 6cb30a26f7d5..e31a9b7b80d0 100644
--- a/src/diffusers/modeling_flax_utils.py
+++ b/src/diffusers/modeling_flax_utils.py
@@ -482,29 +482,6 @@ def from_pretrained(
                 " training."
             )
 
-        # dictionary of key: dtypes for the model params
-        param_dtypes = jax.tree_map(lambda x: x.dtype, state)
-        # extract keys of parameters not in jnp.float32
-        fp16_params = [k for k in param_dtypes if param_dtypes[k] == jnp.float16]
-        bf16_params = [k for k in param_dtypes if param_dtypes[k] == jnp.bfloat16]
-
-        # raise a warning if any of the parameters are not in jnp.float32
-        if len(fp16_params) > 0:
-            logger.warning(
-                f"Some of the weights of {model.__class__.__name__} were initialized in float16 precision from "
-                f"the model checkpoint at {pretrained_model_name_or_path}:\n{fp16_params}\n"
-                "You should probably UPCAST the model weights to float32 if this was not intended. "
-                "See [`~ModelMixin.to_fp32`] for further information on how to do this."
-            )
-
-        if len(bf16_params) > 0:
-            logger.warning(
-                f"Some of the weights of {model.__class__.__name__} were initialized in bfloat16 precision from "
-                f"the model checkpoint at {pretrained_model_name_or_path}:\n{bf16_params}\n"
-                "You should probably UPCAST the model weights to float32 if this was not intended. "
-                "See [`~ModelMixin.to_fp32`] for further information on how to do this."
-            )
-
         return model, unflatten_dict(state)
 
     def save_pretrained(

From 31af4d17e81308887ff63080d49fba644e6c3963 Mon Sep 17 00:00:00 2001
From: mkshing <33302880+mkshing@users.noreply.github.com>
Date: Fri, 21 Oct 2022 22:38:09 +0900
Subject: [PATCH 029/269] Support LMSDiscreteScheduler in LDMPipeline (#891)

* Support LMSDiscreteScheduler in LDMPipeline

This is a small change to support all schedulers such as LMSDiscreteScheduler in LDMPipeline.

What's changed
-------
* Add the `scale_model_input` function before `step` to ensure correct denoising (L77)

* Add "scale the initial noise by the standard deviation required by the scheduler"

* run `make style`

Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 .../pipeline_latent_diffusion_uncond.py                     | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
index ef82abb7e6cb..a7ffb4adc99a 100644
--- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
+++ b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
@@ -64,6 +64,9 @@ def __call__(
         )
         latents = latents.to(self.device)
 
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
         self.scheduler.set_timesteps(num_inference_steps)
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -74,8 +77,9 @@ def __call__(
             extra_kwargs["eta"] = eta
 
         for t in self.progress_bar(self.scheduler.timesteps):
+            latent_model_input = self.scheduler.scale_model_input(latents, t)
             # predict the noise residual
-            noise_prediction = self.unet(latents, t).sample
+            noise_prediction = self.unet(latent_model_input, t).sample
             # compute the previous noisy sample x_t -> x_t-1
             latents = self.scheduler.step(noise_prediction, t, latents, **extra_kwargs).prev_sample
 

From 2fdd094c10aadd71c17d3e3d28e47b6c3cf26bd2 Mon Sep 17 00:00:00 2001
From: Shyam Sudhakaran <shyamsnair97@gmail.com>
Date: Fri, 21 Oct 2022 08:43:19 -0700
Subject: [PATCH 030/269] Wildcard stable diffusion pipeline (#900)

* Initial Wildcard Stable Diffusion Pipeline

* Added some additional example usage

* style

* Added links in README and additional documentation

* Initial Wildcard Stable Diffusion Pipeline

* Added some additional example usage

* style

* Added links in README and additional documentation

* cleanup readme again

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 examples/community/README.md                  |  58 +++
 .../community/wildcard_stable_diffusion.py    | 418 ++++++++++++++++++
 2 files changed, 476 insertions(+)
 create mode 100644 examples/community/wildcard_stable_diffusion.py

diff --git a/examples/community/README.md b/examples/community/README.md
index 9dc98c1ead2f..a5b2252d4408 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -14,6 +14,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Stable Diffusion Mega                  | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega)                   | -                                                                                                                                                                                                                  | [Patrick von Platen](https://github.com/patrickvonplaten/) |
 | Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt.                                                                                                                                                                                                                                                                                                                                                                                                  | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [SkyTNT](https://github.com/SkyTNT) |
 | Speech to Image                        | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images                                                                                                                                                                                                                                                                                                                                                                                                            | [Speech to Image](#speech-to-image)                               | -                                                                                                                                                                                                                  | [Mikail Duzenli](https://github.com/MikailINTech)
+| Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values                                                                                                                                                                                                                                                                                                     | [Wildcard Stable Diffusion](#wildcard-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Shyam Sudhakaran](https://github.com/shyamsn97) |
 
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
 ```py
@@ -264,3 +265,60 @@ plt.imshow(output.images[0])
 This example produces the following image:
 
 ![image](https://user-images.githubusercontent.com/45072645/196901736-77d9c6fc-63ee-4072-90b0-dc8b903d63e3.png)
+
+### Wildcard Stable Diffusion
+Following the great examples from https://github.com/jtkelm2/stable-diffusion-webui-1/blob/master/scripts/wildcards.py and https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Custom-Scripts#wildcards, here's a minimal implementation that allows for users to add "wildcards", denoted by `__wildcard__` to prompts that are used as placeholders for randomly sampled values given by either a dictionary or a `.txt` file. For example:
+
+Say we have a prompt:
+
+```
+prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
+```
+
+We can then define possible values to be sampled for `animal`, `object`, and `clothing`. These can either be from a `.txt` with the same name as the category.
+
+The possible values can also be defined / combined by using a dictionary like: `{"animal":["dog", "cat", mouse"]}`.
+
+The actual pipeline works just like `StableDiffusionPipeline`, except the `__call__` method takes in:
+
+`wildcard_files`: list of file paths for wild card replacement
+`wildcard_option_dict`: dict with key as `wildcard` and values as a list of possible replacements
+`num_prompt_samples`: number of prompts to sample, uniformly sampling wildcards
+
+A full example:
+
+create `animal.txt`, with contents like:
+
+```
+dog
+cat
+mouse
+```
+
+create `object.txt`, with contents like:
+
+```
+chair
+sofa
+bench
+```
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="wildcard_stable_diffusion",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
+out = pipe(
+    prompt,
+    wildcard_option_dict={
+        "clothing":["hat", "shirt", "scarf", "beret"]
+    },
+    wildcard_files=["object.txt", "animal.txt"],
+    num_prompt_samples=1
+)
diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py
new file mode 100644
index 000000000000..79cb4feb1e7c
--- /dev/null
+++ b/examples/community/wildcard_stable_diffusion.py
@@ -0,0 +1,418 @@
+import inspect
+import os
+import random
+import re
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Union
+
+import torch
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+global_re_wildcard = re.compile(r"__([^_]*)__")
+
+
+def get_filename(path: str):
+    # this doesn't work on Windows
+    return os.path.basename(path).split(".txt")[0]
+
+
+def read_wildcard_values(path: str):
+    with open(path, encoding="utf8") as f:
+        return f.read().splitlines()
+
+
+def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = []):
+    for wildcard_file in wildcard_files:
+        filename = get_filename(wildcard_file)
+        read_values = read_wildcard_values(wildcard_file)
+        if filename not in wildcard_option_dict:
+            wildcard_option_dict[filename] = []
+        wildcard_option_dict[filename].extend(read_values)
+    return wildcard_option_dict
+
+
+def replace_prompt_with_wildcards(
+    prompt: str, wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = []
+):
+    new_prompt = prompt
+
+    # get wildcard options
+    wildcard_option_dict = grab_wildcard_values(wildcard_option_dict, wildcard_files)
+
+    for m in global_re_wildcard.finditer(new_prompt):
+        wildcard_value = m.group()
+        replace_value = random.choice(wildcard_option_dict[wildcard_value.strip("__")])
+        new_prompt = new_prompt.replace(wildcard_value, replace_value, 1)
+
+    return new_prompt
+
+
+@dataclass
+class WildcardStableDiffusionOutput(StableDiffusionPipelineOutput):
+    prompts: List[str]
+
+
+class WildcardStableDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Example Usage:
+        pipe = WildcardStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            revision="fp16",
+            torch_dtype=torch.float16,
+        )
+        prompt = "__animal__ sitting on a __object__ wearing a __clothing__"
+        out = pipe(
+            prompt,
+            wildcard_option_dict={
+                "clothing":["hat", "shirt", "scarf", "beret"]
+            },
+            wildcard_files=["object.txt", "animal.txt"],
+            num_prompt_samples=1
+        )
+
+
+    Pipeline for text-to-image generation with wild cards using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        wildcard_option_dict: Dict[str, List[str]] = {},
+        wildcard_files: List[str] = [],
+        num_prompt_samples: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            wildcard_option_dict (Dict[str, List[str]]):
+                dict with key as `wildcard` and values as a list of possible replacements. For example if a prompt, "A __animal__ sitting on a chair". A wildcard_option_dict can provide possible values for "animal" like this: {"animal":["dog", "cat", "fox"]}
+            wildcard_files: (List[str])
+               List of filenames of txt files for wildcard replacements. For example if a prompt, "A __animal__ sitting on a chair". A file can be provided ["animal.txt"]
+            num_prompt_samples: int
+                Number of times to sample wildcards for each prompt provided
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if isinstance(prompt, str):
+            prompt = [
+                replace_prompt_with_wildcards(prompt, wildcard_option_dict, wildcard_files)
+                for i in range(num_prompt_samples)
+            ]
+            batch_size = len(prompt)
+        elif isinstance(prompt, list):
+            prompt_list = []
+            for p in prompt:
+                for i in range(num_prompt_samples):
+                    prompt_list.append(replace_prompt_with_wildcards(p, wildcard_option_dict, wildcard_files))
+            prompt = prompt_list
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return WildcardStableDiffusionOutput(images=image, nsfw_content_detected=has_nsfw_concept, prompts=prompt)

From 9bca40296e3f00fb26597a0f4cfe2fdfd2ad2fd2 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Sat, 22 Oct 2022 09:33:40 +0200
Subject: [PATCH 031/269] [MPS] fix mps failing tests (#934)

fix mps failing tests
---
 tests/test_models_vae.py | 16 +++++++++++++++-
 tests/test_pipelines.py  |  8 +++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/tests/test_models_vae.py b/tests/test_models_vae.py
index 9fb7e8ea3bb7..49610f848364 100644
--- a/tests/test_models_vae.py
+++ b/tests/test_models_vae.py
@@ -106,7 +106,21 @@ def test_output_pretrained(self):
 
         # Since the VAE Gaussian prior's generator is seeded on the appropriate device,
         # the expected output slices are not the same for CPU and GPU.
-        if torch_device in ("mps", "cpu"):
+        if torch_device == "mps":
+            expected_output_slice = torch.tensor(
+                [
+                    -4.0078e-01,
+                    -3.8323e-04,
+                    -1.2681e-01,
+                    -1.1462e-01,
+                    2.0095e-01,
+                    1.0893e-01,
+                    -8.8247e-02,
+                    -3.0361e-01,
+                    -9.8644e-03,
+                ]
+            )
+        elif torch_device == "cpu":
             expected_output_slice = torch.tensor(
                 [-0.1352, 0.0878, 0.0419, -0.0818, -0.1069, 0.0688, -0.1458, -0.4446, -0.0026]
             )
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 9d1c41eee93c..025288e03207 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -455,7 +455,13 @@ def test_components(self):
         text2img = StableDiffusionPipeline(**inpaint.components).to(torch_device)
 
         prompt = "A painting of a squirrel eating a burger"
-        generator = torch.Generator(device=torch_device).manual_seed(0)
+
+        # Device type MPS is not supported for torch.Generator() api.
+        if torch_device == "mps":
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
+
         image_inpaint = inpaint(
             [prompt],
             generator=generator,

From 2d35f6733a2d698e8917896071444a5923993ae7 Mon Sep 17 00:00:00 2001
From: Chenguo Lin <linchenguo0622@gmail.com>
Date: Mon, 24 Oct 2022 17:18:32 +0800
Subject: [PATCH 032/269] fix a small typo in pipeline_ddpm.py (#948)

one small typo in pipeline_ddpm.py

just a small typo in one comment
---
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index aae29737aae3..5842a6d297bb 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -79,7 +79,7 @@ def __call__(
             # 1. predict noise model_output
             model_output = self.unet(image, t).sample
 
-            # 2. compute previous image: x_t -> t_t-1
+            # 2. compute previous image: x_t -> x_t-1
             image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample
 
         image = (image / 2 + 0.5).clamp(0, 1)

From 2c82e0c4eb47ba59bdd98edf6809ec17018e5989 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Mon, 24 Oct 2022 16:34:01 +0200
Subject: [PATCH 033/269] Reorganize pipeline tests (#963)

* Reorganize pipeline tests

* fix vq
---
 src/diffusers/utils/testing_utils.py          |  23 +-
 tests/pipelines/__init__.py                   |   0
 tests/pipelines/ddim/__init__.py              |   0
 tests/pipelines/ddim/test_ddim.py             | 114 ++++
 tests/pipelines/ddpm/__init__.py              |   0
 tests/pipelines/ddpm/test_ddpm.py             |  55 ++
 tests/pipelines/karras_ve/__init__.py         |   0
 tests/pipelines/karras_ve/test_karras_ve.py   |  86 ++++
 tests/pipelines/latent_diffusion/__init__.py  |   0
 .../latent_diffusion/test_latent_diffusion.py | 153 ++++++
 .../test_latent_diffusion_uncond.py           | 119 +++++
 tests/pipelines/pndm/__init__.py              |   0
 tests/pipelines/pndm/test_pndm.py             |  87 ++++
 tests/pipelines/score_sde_ve/__init__.py      |   0
 .../score_sde_ve/test_score_sde_ve.py         |  91 ++++
 tests/pipelines/stable_diffusion/__init__.py  |   0
 .../test_onnx_stable_diffusion.py             |  84 +++
 .../test_onnx_stable_diffusion_img2img.py     |  62 +++
 .../test_onnx_stable_diffusion_inpaint.py     |  65 +++
 .../stable_diffusion/test_stable_diffusion.py |   9 +-
 .../test_stable_diffusion_img2img.py          |   9 +-
 .../test_stable_diffusion_inpaint.py          |   9 +-
 .../test_stable_diffusion_inpaint_legacy.py   |   9 +-
 tests/test_pipelines.py                       | 487 +-----------------
 tests/test_pipelines_common.py                |  12 +
 tests/test_pipelines_flax.py                  |   2 +-
 tests/test_pipelines_onnx_common.py           |  12 +
 27 files changed, 993 insertions(+), 495 deletions(-)
 create mode 100644 tests/pipelines/__init__.py
 create mode 100644 tests/pipelines/ddim/__init__.py
 create mode 100644 tests/pipelines/ddim/test_ddim.py
 create mode 100644 tests/pipelines/ddpm/__init__.py
 create mode 100644 tests/pipelines/ddpm/test_ddpm.py
 create mode 100644 tests/pipelines/karras_ve/__init__.py
 create mode 100644 tests/pipelines/karras_ve/test_karras_ve.py
 create mode 100644 tests/pipelines/latent_diffusion/__init__.py
 create mode 100644 tests/pipelines/latent_diffusion/test_latent_diffusion.py
 create mode 100644 tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py
 create mode 100644 tests/pipelines/pndm/__init__.py
 create mode 100644 tests/pipelines/pndm/test_pndm.py
 create mode 100644 tests/pipelines/score_sde_ve/__init__.py
 create mode 100644 tests/pipelines/score_sde_ve/test_score_sde_ve.py
 create mode 100644 tests/pipelines/stable_diffusion/__init__.py
 create mode 100644 tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
 create mode 100644 tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
 create mode 100644 tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
 create mode 100644 tests/test_pipelines_common.py
 create mode 100644 tests/test_pipelines_onnx_common.py

diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index 77963ad7242e..700faf209926 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -14,7 +14,7 @@
 import requests
 from packaging import version
 
-from .import_utils import is_flax_available, is_torch_available
+from .import_utils import is_flax_available, is_onnx_available, is_torch_available
 
 
 global_rng = random.Random()
@@ -100,6 +100,20 @@ def slow(test_case):
     return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case)
 
 
+def require_torch(test_case):
+    """
+    Decorator marking a test that requires PyTorch. These tests are skipped when PyTorch isn't installed.
+    """
+    return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case)
+
+
+def require_torch_gpu(test_case):
+    """Decorator marking a test that requires CUDA and PyTorch."""
+    return unittest.skipUnless(is_torch_available() and torch_device == "cuda", "test requires PyTorch+CUDA")(
+        test_case
+    )
+
+
 def require_flax(test_case):
     """
     Decorator marking a test that requires JAX & Flax. These tests are skipped when one / both are not installed
@@ -107,6 +121,13 @@ def require_flax(test_case):
     return unittest.skipUnless(is_flax_available(), "test requires JAX & Flax")(test_case)
 
 
+def require_onnxruntime(test_case):
+    """
+    Decorator marking a test that requires onnxruntime. These tests are skipped when onnxruntime isn't installed.
+    """
+    return unittest.skipUnless(is_onnx_available(), "test requires onnxruntime")(test_case)
+
+
 def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
     """
     Args:
diff --git a/tests/pipelines/__init__.py b/tests/pipelines/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/ddim/__init__.py b/tests/pipelines/ddim/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py
new file mode 100644
index 000000000000..4445fe7feecf
--- /dev/null
+++ b/tests/pipelines/ddim/test_ddim.py
@@ -0,0 +1,114 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel
+from diffusers.utils.testing_utils import require_torch, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    def test_inference(self):
+        unet = self.dummy_uncond_unet
+        scheduler = DDIMScheduler()
+
+        ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        # Warmup pass when using mps (see #372)
+        if torch_device == "mps":
+            _ = ddpm(num_inference_steps=1)
+
+        generator = torch.manual_seed(0)
+        image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
+
+        generator = torch.manual_seed(0)
+        image_from_tuple = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array(
+            [1.000e00, 5.717e-01, 4.717e-01, 1.000e00, 0.000e00, 1.000e00, 3.000e-04, 0.000e00, 9.000e-04]
+        )
+        tolerance = 1e-2 if torch_device != "mps" else 3e-2
+        assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance
+
+
+@slow
+@require_torch
+class DDIMPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_ema_bedroom(self):
+        model_id = "google/ddpm-ema-bedroom-256"
+
+        unet = UNet2DModel.from_pretrained(model_id)
+        scheduler = DDIMScheduler.from_config(model_id)
+
+        ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = ddpm(generator=generator, output_type="numpy").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.00605, 0.0201, 0.0344, 0.00235, 0.00185, 0.00025, 0.00215, 0.0, 0.00685])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_inference_cifar10(self):
+        model_id = "google/ddpm-cifar10-32"
+
+        unet = UNet2DModel.from_pretrained(model_id)
+        scheduler = DDIMScheduler()
+
+        ddim = DDIMPipeline(unet=unet, scheduler=scheduler)
+        ddim.to(torch_device)
+        ddim.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = ddim(generator=generator, eta=0.0, output_type="numpy").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.17235, 0.16175, 0.16005, 0.16255, 0.1497, 0.1513, 0.15045, 0.1442, 0.1453])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/ddpm/__init__.py b/tests/pipelines/ddpm/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
new file mode 100644
index 000000000000..c58e2db38f21
--- /dev/null
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -0,0 +1,55 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
+from diffusers.utils.testing_utils import require_torch, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class DDPMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    # FIXME: add fast tests
+    pass
+
+
+@slow
+@require_torch
+class DDPMPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_cifar10(self):
+        model_id = "google/ddpm-cifar10-32"
+
+        unet = UNet2DModel.from_pretrained(model_id)
+        scheduler = DDPMScheduler.from_config(model_id)
+
+        ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = ddpm(generator=generator, output_type="numpy").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.41995, 0.35885, 0.19385, 0.38475, 0.3382, 0.2647, 0.41545, 0.3582, 0.33845])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/karras_ve/__init__.py b/tests/pipelines/karras_ve/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/karras_ve/test_karras_ve.py b/tests/pipelines/karras_ve/test_karras_ve.py
new file mode 100644
index 000000000000..1fafa1cb40fa
--- /dev/null
+++ b/tests/pipelines/karras_ve/test_karras_ve.py
@@ -0,0 +1,86 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import KarrasVePipeline, KarrasVeScheduler, UNet2DModel
+from diffusers.utils.testing_utils import require_torch, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class KarrasVePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    def test_inference(self):
+        unet = self.dummy_uncond_unet
+        scheduler = KarrasVeScheduler()
+
+        pipe = KarrasVePipeline(unet=unet, scheduler=scheduler)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = pipe(num_inference_steps=2, generator=generator, output_type="numpy").images
+
+        generator = torch.manual_seed(0)
+        image_from_tuple = pipe(num_inference_steps=2, generator=generator, output_type="numpy", return_dict=False)[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@slow
+@require_torch
+class KarrasVePipelineIntegrationTests(unittest.TestCase):
+    def test_inference(self):
+        model_id = "google/ncsnpp-celebahq-256"
+        model = UNet2DModel.from_pretrained(model_id)
+        scheduler = KarrasVeScheduler()
+
+        pipe = KarrasVePipeline(unet=model, scheduler=scheduler)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images
+
+        image_slice = image[0, -3:, -3:, -1]
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.578, 0.5811, 0.5924, 0.5809, 0.587, 0.5886, 0.5861, 0.5802, 0.586])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/latent_diffusion/__init__.py b/tests/pipelines/latent_diffusion/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/tests/pipelines/latent_diffusion/test_latent_diffusion.py
new file mode 100644
index 000000000000..085cdb4e766b
--- /dev/null
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion.py
@@ -0,0 +1,153 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel
+from diffusers.utils.testing_utils import require_torch, slow, torch_device
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    def test_inference_text2img(self):
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler()
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        ldm = LDMTextToImagePipeline(vqvae=vae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+        ldm.to(torch_device)
+        ldm.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        # Warmup pass when using mps (see #372)
+        if torch_device == "mps":
+            generator = torch.manual_seed(0)
+            _ = ldm(
+                [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=1, output_type="numpy"
+            ).images
+
+        generator = torch.manual_seed(0)
+        image = ldm(
+            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="numpy"
+        ).images
+
+        generator = torch.manual_seed(0)
+        image_from_tuple = ldm(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="numpy",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5074, 0.5026, 0.4998, 0.4056, 0.3523, 0.4649, 0.5289, 0.5299, 0.4897])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@slow
+@require_torch
+class LDMTextToImagePipelineIntegrationTests(unittest.TestCase):
+    def test_inference_text2img(self):
+        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+        ldm.to(torch_device)
+        ldm.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        image = ldm(
+            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="numpy"
+        ).images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.9256, 0.9340, 0.8933, 0.9361, 0.9113, 0.8727, 0.9122, 0.8745, 0.8099])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_inference_text2img_fast(self):
+        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+        ldm.to(torch_device)
+        ldm.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.manual_seed(0)
+        image = ldm(prompt, generator=generator, num_inference_steps=1, output_type="numpy").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.3163, 0.8670, 0.6465, 0.1865, 0.6291, 0.5139, 0.2824, 0.3723, 0.4344])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py
new file mode 100644
index 000000000000..dea2971cbb95
--- /dev/null
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py
@@ -0,0 +1,119 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DDIMScheduler, LDMPipeline, UNet2DModel, VQModel
+from diffusers.utils.testing_utils import require_torch, slow, torch_device
+from transformers import CLIPTextConfig, CLIPTextModel
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class LDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    @property
+    def dummy_vq_model(self):
+        torch.manual_seed(0)
+        model = VQModel(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=3,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    def test_inference_uncond(self):
+        unet = self.dummy_uncond_unet
+        scheduler = DDIMScheduler()
+        vae = self.dummy_vq_model
+
+        ldm = LDMPipeline(unet=unet, vqvae=vae, scheduler=scheduler)
+        ldm.to(torch_device)
+        ldm.set_progress_bar_config(disable=None)
+
+        # Warmup pass when using mps (see #372)
+        if torch_device == "mps":
+            generator = torch.manual_seed(0)
+            _ = ldm(generator=generator, num_inference_steps=1, output_type="numpy").images
+
+        generator = torch.manual_seed(0)
+        image = ldm(generator=generator, num_inference_steps=2, output_type="numpy").images
+
+        generator = torch.manual_seed(0)
+        image_from_tuple = ldm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.8512, 0.818, 0.6411, 0.6808, 0.4465, 0.5618, 0.46, 0.6231, 0.5172])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@slow
+@require_torch
+class LDMPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_uncond(self):
+        ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
+        ldm.to(torch_device)
+        ldm.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/pndm/__init__.py b/tests/pipelines/pndm/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/pndm/test_pndm.py b/tests/pipelines/pndm/test_pndm.py
new file mode 100644
index 000000000000..5d9212223e6e
--- /dev/null
+++ b/tests/pipelines/pndm/test_pndm.py
@@ -0,0 +1,87 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import PNDMPipeline, PNDMScheduler, UNet2DModel
+from diffusers.utils.testing_utils import require_torch, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class PNDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    def test_inference(self):
+        unet = self.dummy_uncond_unet
+        scheduler = PNDMScheduler()
+
+        pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
+        pndm.to(torch_device)
+        pndm.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = pndm(generator=generator, num_inference_steps=20, output_type="numpy").images
+
+        generator = torch.manual_seed(0)
+        image_from_tuple = pndm(generator=generator, num_inference_steps=20, output_type="numpy", return_dict=False)[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@slow
+@require_torch
+class PNDMPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_cifar10(self):
+        model_id = "google/ddpm-cifar10-32"
+
+        unet = UNet2DModel.from_pretrained(model_id)
+        scheduler = PNDMScheduler()
+
+        pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
+        pndm.to(torch_device)
+        pndm.set_progress_bar_config(disable=None)
+        generator = torch.manual_seed(0)
+        image = pndm(generator=generator, output_type="numpy").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.1564, 0.14645, 0.1406, 0.14715, 0.12425, 0.14045, 0.13115, 0.12175, 0.125])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/score_sde_ve/__init__.py b/tests/pipelines/score_sde_ve/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/tests/pipelines/score_sde_ve/test_score_sde_ve.py
new file mode 100644
index 000000000000..55dcc1cea1e0
--- /dev/null
+++ b/tests/pipelines/score_sde_ve/test_score_sde_ve.py
@@ -0,0 +1,91 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel
+from diffusers.utils.testing_utils import require_torch, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class ScoreSdeVeipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    def test_inference(self):
+        unet = self.dummy_uncond_unet
+        scheduler = ScoreSdeVeScheduler()
+
+        sde_ve = ScoreSdeVePipeline(unet=unet, scheduler=scheduler)
+        sde_ve.to(torch_device)
+        sde_ve.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator).images
+
+        generator = torch.manual_seed(0)
+        image_from_tuple = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator, return_dict=False)[
+            0
+        ]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@slow
+@require_torch
+class ScoreSdeVePipelineIntegrationTests(unittest.TestCase):
+    def test_inference(self):
+        model_id = "google/ncsnpp-church-256"
+        model = UNet2DModel.from_pretrained(model_id)
+
+        scheduler = ScoreSdeVeScheduler.from_config(model_id)
+
+        sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler)
+        sde_ve.to(torch_device)
+        sde_ve.set_progress_bar_config(disable=None)
+
+        generator = torch.manual_seed(0)
+        image = sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 256, 256, 3)
+
+        expected_slice = np.array([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/stable_diffusion/__init__.py b/tests/pipelines/stable_diffusion/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
new file mode 100644
index 000000000000..1275b7f980c1
--- /dev/null
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
@@ -0,0 +1,84 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from diffusers import OnnxStableDiffusionPipeline
+from diffusers.utils.testing_utils import require_onnxruntime, slow
+
+from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin
+
+
+class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase):
+    # FIXME: add fast tests
+    pass
+
+
+@slow
+@require_onnxruntime
+class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def test_inference(self):
+        sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
+        )
+
+        prompt = "A painting of a squirrel eating a burger"
+        np.random.seed(0)
+        output = sd_pipe([prompt], guidance_scale=6.0, num_inference_steps=5, output_type="np")
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.3602, 0.3688, 0.3652, 0.3895, 0.3782, 0.3747, 0.3927, 0.4241, 0.4327])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_intermediate_state(self):
+        number_of_steps = 0
+
+        def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
+            test_callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 0:
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [-0.5950, -0.3039, -1.1672, 0.1594, -1.1572, 0.6719, -1.9712, -0.0403, 0.9592]
+                )
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+            elif step == 5:
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [-0.4776, -0.0119, -0.8519, -0.0275, -0.9764, 0.9820, -0.3843, 0.3788, 1.2264]
+                )
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+
+        test_callback_fn.has_been_called = False
+
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "Andromeda galaxy in a bottle"
+
+        np.random.seed(0)
+        pipe(prompt=prompt, num_inference_steps=5, guidance_scale=7.5, callback=test_callback_fn, callback_steps=1)
+        assert test_callback_fn.has_been_called
+        assert number_of_steps == 6
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
new file mode 100644
index 000000000000..25f1b7574265
--- /dev/null
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
@@ -0,0 +1,62 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from diffusers import OnnxStableDiffusionImg2ImgPipeline
+from diffusers.utils.testing_utils import load_image, require_onnxruntime, slow
+
+from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin
+
+
+class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase):
+    # FIXME: add fast tests
+    pass
+
+
+@slow
+@require_onnxruntime
+class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def test_inference(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((768, 512))
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        np.random.seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            num_inference_steps=8,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 383:386, -1]
+
+        assert images.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.4830, 0.5242, 0.5603, 0.5016, 0.5131, 0.5111, 0.4928, 0.5025, 0.5055])
+        # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
new file mode 100644
index 000000000000..3f33022c112e
--- /dev/null
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from diffusers import OnnxStableDiffusionInpaintPipeline
+from diffusers.utils.testing_utils import load_image, require_onnxruntime, slow
+
+from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin
+
+
+class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase):
+    # FIXME: add fast tests
+    pass
+
+
+@slow
+@require_onnxruntime
+class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def test_stable_diffusion_inpaint_onnx(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+
+        pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
+            "runwayml/stable-diffusion-inpainting", revision="onnx", provider="CPUExecutionProvider"
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A red cat sitting on a park bench"
+
+        np.random.seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            guidance_scale=7.5,
+            num_inference_steps=8,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 255:258, -1]
+
+        assert images.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2951, 0.2955, 0.2922, 0.2036, 0.1977, 0.2279, 0.1716, 0.1641, 0.1799])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index e50a22dbaaf8..d7e6c362d1d7 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -31,13 +31,16 @@
     VQModel,
 )
 from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
+from ...test_pipelines_common import PipelineTesterMixin
+
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
-class PipelineFastTests(unittest.TestCase):
+class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
@@ -514,8 +517,8 @@ def test_stable_diffusion_fp16(self):
 
 
 @slow
-@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-class PipelineIntegrationTests(unittest.TestCase):
+@require_torch_gpu
+class StableDiffusionPipelineIntegrationTests(unittest.TestCase):
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 473014fa2ea7..409bb225567a 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -30,13 +30,16 @@
     VQModel,
 )
 from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
+from ...test_pipelines_common import PipelineTesterMixin
+
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
-class PipelineFastTests(unittest.TestCase):
+class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
@@ -461,8 +464,8 @@ def test_stable_diffusion_img2img_fp16(self):
 
 
 @slow
-@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-class PipelineIntegrationTests(unittest.TestCase):
+@require_torch_gpu
+class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 5244bda1cbb0..14b4990784e4 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -29,14 +29,17 @@
     VQModel,
 )
 from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
 from PIL import Image
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
+from ...test_pipelines_common import PipelineTesterMixin
+
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
-class PipelineFastTests(unittest.TestCase):
+class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
@@ -258,8 +261,8 @@ def test_stable_diffusion_inpaint_fp16(self):
 
 
 @slow
-@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-class PipelineIntegrationTests(unittest.TestCase):
+@require_torch_gpu
+class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index 3259ed49f4f2..697da4c921df 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -31,14 +31,17 @@
     VQModel,
 )
 from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
 from PIL import Image
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
+from ...test_pipelines_common import PipelineTesterMixin
+
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
-class PipelineFastTests(unittest.TestCase):
+class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
@@ -338,8 +341,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
 
 
 @slow
-@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
-class PipelineIntegrationTests(unittest.TestCase):
+@require_torch_gpu
+class StableDiffusionInpaintLegacyPipelineIntegrationTests(unittest.TestCase):
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 025288e03207..a4686366c8e7 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -32,17 +32,7 @@
     DDIMScheduler,
     DDPMPipeline,
     DDPMScheduler,
-    KarrasVePipeline,
-    KarrasVeScheduler,
-    LDMPipeline,
-    LDMTextToImagePipeline,
-    OnnxStableDiffusionImg2ImgPipeline,
-    OnnxStableDiffusionInpaintPipeline,
-    OnnxStableDiffusionPipeline,
-    PNDMPipeline,
     PNDMScheduler,
-    ScoreSdeVePipeline,
-    ScoreSdeVeScheduler,
     StableDiffusionImg2ImgPipeline,
     StableDiffusionInpaintPipelineLegacy,
     StableDiffusionPipeline,
@@ -53,8 +43,8 @@
 )
 from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
-from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, load_image, slow, torch_device
-from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir
+from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, slow, torch_device
+from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir, require_torch_gpu
 from packaging import version
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
@@ -145,12 +135,6 @@ def test_load_pipeline_from_git(self):
 
 
 class PipelineFastTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
     @property
     def dummy_image(self):
         batch_size = 1
@@ -261,174 +245,6 @@ def to(self, device):
 
         return extract
 
-    def test_ddim(self):
-        unet = self.dummy_uncond_unet
-        scheduler = DDIMScheduler()
-
-        ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
-        ddpm.to(torch_device)
-        ddpm.set_progress_bar_config(disable=None)
-
-        # Warmup pass when using mps (see #372)
-        if torch_device == "mps":
-            _ = ddpm(num_inference_steps=1)
-
-        generator = torch.manual_seed(0)
-        image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
-
-        generator = torch.manual_seed(0)
-        image_from_tuple = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array(
-            [1.000e00, 5.717e-01, 4.717e-01, 1.000e00, 0.000e00, 1.000e00, 3.000e-04, 0.000e00, 9.000e-04]
-        )
-        tolerance = 1e-2 if torch_device != "mps" else 3e-2
-        assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance
-
-    def test_pndm_cifar10(self):
-        unet = self.dummy_uncond_unet
-        scheduler = PNDMScheduler()
-
-        pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
-        pndm.to(torch_device)
-        pndm.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        image = pndm(generator=generator, num_inference_steps=20, output_type="numpy").images
-
-        generator = torch.manual_seed(0)
-        image_from_tuple = pndm(generator=generator, num_inference_steps=20, output_type="numpy", return_dict=False)[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_ldm_text2img(self):
-        unet = self.dummy_cond_unet
-        scheduler = DDIMScheduler()
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        ldm = LDMTextToImagePipeline(vqvae=vae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
-        ldm.to(torch_device)
-        ldm.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-
-        # Warmup pass when using mps (see #372)
-        if torch_device == "mps":
-            generator = torch.manual_seed(0)
-            _ = ldm(
-                [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=1, output_type="numpy"
-            ).images
-
-        generator = torch.manual_seed(0)
-        image = ldm(
-            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="numpy"
-        ).images
-
-        generator = torch.manual_seed(0)
-        image_from_tuple = ldm(
-            [prompt],
-            generator=generator,
-            guidance_scale=6.0,
-            num_inference_steps=2,
-            output_type="numpy",
-            return_dict=False,
-        )[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5074, 0.5026, 0.4998, 0.4056, 0.3523, 0.4649, 0.5289, 0.5299, 0.4897])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_score_sde_ve_pipeline(self):
-        unet = self.dummy_uncond_unet
-        scheduler = ScoreSdeVeScheduler()
-
-        sde_ve = ScoreSdeVePipeline(unet=unet, scheduler=scheduler)
-        sde_ve.to(torch_device)
-        sde_ve.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        image = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator).images
-
-        generator = torch.manual_seed(0)
-        image_from_tuple = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator, return_dict=False)[
-            0
-        ]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_ldm_uncond(self):
-        unet = self.dummy_uncond_unet
-        scheduler = DDIMScheduler()
-        vae = self.dummy_vq_model
-
-        ldm = LDMPipeline(unet=unet, vqvae=vae, scheduler=scheduler)
-        ldm.to(torch_device)
-        ldm.set_progress_bar_config(disable=None)
-
-        # Warmup pass when using mps (see #372)
-        if torch_device == "mps":
-            generator = torch.manual_seed(0)
-            _ = ldm(generator=generator, num_inference_steps=1, output_type="numpy").images
-
-        generator = torch.manual_seed(0)
-        image = ldm(generator=generator, num_inference_steps=2, output_type="numpy").images
-
-        generator = torch.manual_seed(0)
-        image_from_tuple = ldm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.8512, 0.818, 0.6411, 0.6808, 0.4465, 0.5618, 0.46, 0.6231, 0.5172])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
-    def test_karras_ve_pipeline(self):
-        unet = self.dummy_uncond_unet
-        scheduler = KarrasVeScheduler()
-
-        pipe = KarrasVePipeline(unet=unet, scheduler=scheduler)
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        image = pipe(num_inference_steps=2, generator=generator, output_type="numpy").images
-
-        generator = torch.manual_seed(0)
-        image_from_tuple = pipe(num_inference_steps=2, generator=generator, output_type="numpy", return_dict=False)[0]
-
-        image_slice = image[0, -3:, -3:, -1]
-        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
-
     def test_components(self):
         """Test that components property works correctly"""
         unet = self.dummy_cond_unet
@@ -489,7 +305,8 @@ def test_components(self):
         assert image_text2img.shape == (1, 128, 128, 3)
 
 
-class PipelineTesterMixin(unittest.TestCase):
+@slow
+class PipelineSlowTests(unittest.TestCase):
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
@@ -556,7 +373,6 @@ def test_from_pretrained_save_pretrained(self):
 
         assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass"
 
-    @slow
     def test_from_pretrained_hub(self):
         model_path = "google/ddpm-cifar10-32"
 
@@ -577,7 +393,6 @@ def test_from_pretrained_hub(self):
 
         assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass"
 
-    @slow
     def test_from_pretrained_hub_pass_model(self):
         model_path = "google/ddpm-cifar10-32"
 
@@ -601,7 +416,6 @@ def test_from_pretrained_hub_pass_model(self):
 
         assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass"
 
-    @slow
     def test_output_format(self):
         model_path = "google/ddpm-cifar10-32"
 
@@ -624,156 +438,6 @@ def test_output_format(self):
         assert isinstance(images, list)
         assert isinstance(images[0], PIL.Image.Image)
 
-    @slow
-    def test_ddpm_cifar10(self):
-        model_id = "google/ddpm-cifar10-32"
-
-        unet = UNet2DModel.from_pretrained(model_id)
-        scheduler = DDPMScheduler.from_config(model_id)
-
-        ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
-        ddpm.to(torch_device)
-        ddpm.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        image = ddpm(generator=generator, output_type="numpy").images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.41995, 0.35885, 0.19385, 0.38475, 0.3382, 0.2647, 0.41545, 0.3582, 0.33845])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    def test_ddim_lsun(self):
-        model_id = "google/ddpm-ema-bedroom-256"
-
-        unet = UNet2DModel.from_pretrained(model_id)
-        scheduler = DDIMScheduler.from_config(model_id)
-
-        ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
-        ddpm.to(torch_device)
-        ddpm.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        image = ddpm(generator=generator, output_type="numpy").images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.00605, 0.0201, 0.0344, 0.00235, 0.00185, 0.00025, 0.00215, 0.0, 0.00685])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    def test_ddim_cifar10(self):
-        model_id = "google/ddpm-cifar10-32"
-
-        unet = UNet2DModel.from_pretrained(model_id)
-        scheduler = DDIMScheduler()
-
-        ddim = DDIMPipeline(unet=unet, scheduler=scheduler)
-        ddim.to(torch_device)
-        ddim.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        image = ddim(generator=generator, eta=0.0, output_type="numpy").images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.17235, 0.16175, 0.16005, 0.16255, 0.1497, 0.1513, 0.15045, 0.1442, 0.1453])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    def test_pndm_cifar10(self):
-        model_id = "google/ddpm-cifar10-32"
-
-        unet = UNet2DModel.from_pretrained(model_id)
-        scheduler = PNDMScheduler()
-
-        pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
-        pndm.to(torch_device)
-        pndm.set_progress_bar_config(disable=None)
-        generator = torch.manual_seed(0)
-        image = pndm(generator=generator, output_type="numpy").images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.1564, 0.14645, 0.1406, 0.14715, 0.12425, 0.14045, 0.13115, 0.12175, 0.125])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    def test_ldm_text2img(self):
-        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
-        ldm.to(torch_device)
-        ldm.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.manual_seed(0)
-        image = ldm(
-            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="numpy"
-        ).images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.9256, 0.9340, 0.8933, 0.9361, 0.9113, 0.8727, 0.9122, 0.8745, 0.8099])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    def test_ldm_text2img_fast(self):
-        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
-        ldm.to(torch_device)
-        ldm.set_progress_bar_config(disable=None)
-
-        prompt = "A painting of a squirrel eating a burger"
-        generator = torch.manual_seed(0)
-        image = ldm(prompt, generator=generator, num_inference_steps=1, output_type="numpy").images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.3163, 0.8670, 0.6465, 0.1865, 0.6291, 0.5139, 0.2824, 0.3723, 0.4344])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    def test_score_sde_ve_pipeline(self):
-        model_id = "google/ncsnpp-church-256"
-        model = UNet2DModel.from_pretrained(model_id)
-
-        scheduler = ScoreSdeVeScheduler.from_config(model_id)
-
-        sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler)
-        sde_ve.to(torch_device)
-        sde_ve.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        image = sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 256, 256, 3)
-
-        expected_slice = np.array([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    def test_ldm_uncond(self):
-        ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
-        ldm.to(torch_device)
-        ldm.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
     def test_ddpm_ddim_equality(self):
         model_id = "google/ddpm-cifar10-32"
 
@@ -824,145 +488,7 @@ def test_ddpm_ddim_equality_batched(self):
         # the values aren't exactly equal, but the images look the same visually
         assert np.abs(ddpm_images - ddim_images).max() < 1e-1
 
-    @slow
-    def test_karras_ve_pipeline(self):
-        model_id = "google/ncsnpp-celebahq-256"
-        model = UNet2DModel.from_pretrained(model_id)
-        scheduler = KarrasVeScheduler()
-
-        pipe = KarrasVePipeline(unet=model, scheduler=scheduler)
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images
-
-        image_slice = image[0, -3:, -3:, -1]
-        assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.578, 0.5811, 0.5924, 0.5809, 0.587, 0.5886, 0.5861, 0.5802, 0.586])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-
-    @slow
-    def test_stable_diffusion_onnx(self):
-        sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
-        )
-
-        prompt = "A painting of a squirrel eating a burger"
-        np.random.seed(0)
-        output = sd_pipe([prompt], guidance_scale=6.0, num_inference_steps=5, output_type="np")
-        image = output.images
-
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.3602, 0.3688, 0.3652, 0.3895, 0.3782, 0.3747, 0.3927, 0.4241, 0.4327])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    @slow
-    def test_stable_diffusion_img2img_onnx(self):
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/img2img/sketch-mountains-input.jpg"
-        )
-        init_image = init_image.resize((768, 512))
-        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
-        )
-        pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A fantasy landscape, trending on artstation"
-
-        np.random.seed(0)
-        output = pipe(
-            prompt=prompt,
-            init_image=init_image,
-            strength=0.75,
-            guidance_scale=7.5,
-            num_inference_steps=8,
-            output_type="np",
-        )
-        images = output.images
-        image_slice = images[0, 255:258, 383:386, -1]
-
-        assert images.shape == (1, 512, 768, 3)
-        expected_slice = np.array([0.4830, 0.5242, 0.5603, 0.5016, 0.5131, 0.5111, 0.4928, 0.5025, 0.5055])
-        # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
-
-    @slow
-    def test_stable_diffusion_inpaint_onnx(self):
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo.png"
-        )
-        mask_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
-        )
-
-        pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
-            "runwayml/stable-diffusion-inpainting", revision="onnx", provider="CPUExecutionProvider"
-        )
-        pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A red cat sitting on a park bench"
-
-        np.random.seed(0)
-        output = pipe(
-            prompt=prompt,
-            image=init_image,
-            mask_image=mask_image,
-            guidance_scale=7.5,
-            num_inference_steps=8,
-            output_type="np",
-        )
-        images = output.images
-        image_slice = images[0, 255:258, 255:258, -1]
-
-        assert images.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.2951, 0.2955, 0.2922, 0.2036, 0.1977, 0.2279, 0.1716, 0.1641, 0.1799])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    @slow
-    def test_stable_diffusion_onnx_intermediate_state(self):
-        number_of_steps = 0
-
-        def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
-            test_callback_fn.has_been_called = True
-            nonlocal number_of_steps
-            number_of_steps += 1
-            if step == 0:
-                assert latents.shape == (1, 4, 64, 64)
-                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array(
-                    [-0.5950, -0.3039, -1.1672, 0.1594, -1.1572, 0.6719, -1.9712, -0.0403, 0.9592]
-                )
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
-            elif step == 5:
-                assert latents.shape == (1, 4, 64, 64)
-                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array(
-                    [-0.4776, -0.0119, -0.8519, -0.0275, -0.9764, 0.9820, -0.3843, 0.3788, 1.2264]
-                )
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
-
-        test_callback_fn.has_been_called = False
-
-        pipe = OnnxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
-        )
-        pipe.set_progress_bar_config(disable=None)
-
-        prompt = "Andromeda galaxy in a bottle"
-
-        np.random.seed(0)
-        pipe(prompt=prompt, num_inference_steps=5, guidance_scale=7.5, callback=test_callback_fn, callback_steps=1)
-        assert test_callback_fn.has_been_called
-        assert number_of_steps == 6
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
+    @require_torch_gpu
     def test_stable_diffusion_accelerate_load_works(self):
         if version.parse(version.parse(transformers.__version__).base_version) < version.parse("4.23"):
             return
@@ -975,8 +501,7 @@ def test_stable_diffusion_accelerate_load_works(self):
             model_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto"
         ).to(torch_device)
 
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
+    @require_torch_gpu
     def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self):
         if version.parse(version.parse(transformers.__version__).base_version) < version.parse("4.23"):
             return
diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py
new file mode 100644
index 000000000000..bf99925f1c4f
--- /dev/null
+++ b/tests/test_pipelines_common.py
@@ -0,0 +1,12 @@
+from diffusers.utils.testing_utils import require_torch
+
+
+@require_torch
+class PipelineTesterMixin:
+    """
+    This mixin is designed to be used with unittest.TestCase classes.
+    It provides a set of common tests for each PyTorch pipeline, e.g. saving and loading the pipeline,
+    equivalence of dict and tuple outputs, etc.
+    """
+
+    pass
diff --git a/tests/test_pipelines_flax.py b/tests/test_pipelines_flax.py
index 9256944815c7..436e139d9162 100644
--- a/tests/test_pipelines_flax.py
+++ b/tests/test_pipelines_flax.py
@@ -30,8 +30,8 @@
     from jax import pmap
 
 
-@require_flax
 @slow
+@require_flax
 class FlaxPipelineTests(unittest.TestCase):
     def test_dummy_all_tpus(self):
         pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
diff --git a/tests/test_pipelines_onnx_common.py b/tests/test_pipelines_onnx_common.py
new file mode 100644
index 000000000000..575ecd007531
--- /dev/null
+++ b/tests/test_pipelines_onnx_common.py
@@ -0,0 +1,12 @@
+from diffusers.utils.testing_utils import require_onnxruntime
+
+
+@require_onnxruntime
+class OnnxPipelineTesterMixin:
+    """
+    This mixin is designed to be used with unittest.TestCase classes.
+    It provides a set of common tests for each ONNXRuntime pipeline, e.g. saving and loading the pipeline,
+    equivalence of dict and tuple outputs, etc.
+    """
+
+    pass

From 8aac1f99d7af5873db7d23c07fba370d0f5061a6 Mon Sep 17 00:00:00 2001
From: apolinario <joaopaulo.passos@gmail.com>
Date: Mon, 24 Oct 2022 22:50:23 +0200
Subject: [PATCH 034/269] v1-5 docs updates (#921)

* Update README.md

Additionally add FLAX so the model card can be slimmer and point to this page

* Find and replace all

* v-1-5 -> v1-5

* revert test changes

* Update README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update docs/source/quicktour.mdx

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Update README.md

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Update docs/source/quicktour.mdx

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Update README.md

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Revert certain references to v1-5

* Docs changes

* Apply suggestions from code review

Co-authored-by: apolinario <joaopaulo.passos+multimodal@gmail.com>
Co-authored-by: anton-l <anton@huggingface.co>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
---
 README.md                                     | 113 +++++++++++++++---
 docs/source/api/pipelines/overview.mdx        |   8 +-
 docs/source/optimization/fp16.mdx             |  10 +-
 docs/source/optimization/mps.mdx              |   2 +-
 docs/source/optimization/onnx.mdx             |   2 +-
 docs/source/quicktour.mdx                     |  15 ++-
 docs/source/training/text_inversion.mdx       |   4 +-
 .../using-diffusers/custom_pipelines.mdx      |   2 +-
 docs/source/using-diffusers/img2img.mdx       |   2 +-
 examples/community/README.md                  |   6 +-
 examples/community/stable_diffusion_mega.py   |   2 +-
 examples/test_examples.py                     |   2 +-
 examples/textual_inversion/README.md          |   2 +-
 src/diffusers/modeling_flax_utils.py          |  14 +--
 src/diffusers/pipeline_flax_utils.py          |   6 +-
 src/diffusers/pipeline_utils.py               |   6 +-
 src/diffusers/pipelines/README.md             |   8 +-
 .../pipelines/stable_diffusion/README.md      |  16 +--
 .../pipeline_flax_stable_diffusion.py         |   2 +-
 .../pipeline_onnx_stable_diffusion_img2img.py |   2 +-
 .../pipeline_onnx_stable_diffusion_inpaint.py |   2 +-
 .../pipeline_stable_diffusion.py              |   2 +-
 .../pipeline_stable_diffusion_img2img.py      |   2 +-
 .../pipeline_stable_diffusion_inpaint.py      |   2 +-
 ...ipeline_stable_diffusion_inpaint_legacy.py |   2 +-
 25 files changed, 156 insertions(+), 78 deletions(-)

diff --git a/README.md b/README.md
index fe006f41c2c6..1dc1d7838a5d 100644
--- a/README.md
+++ b/README.md
@@ -64,44 +64,54 @@ In order to get started, we recommend taking a look at two notebooks:
 - The [Training a diffusers model](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) notebook summarizes diffusion models training methods. This notebook takes a step-by-step approach to training your
   diffusion models on an image dataset, with explanatory graphics. 
   
-## **New** Stable Diffusion is now fully compatible with `diffusers`!  
+## Stable Diffusion is fully compatible with `diffusers`!  
 
-Stable Diffusion is a text-to-image latent diffusion model created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/). It's trained on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 10GB VRAM.
+Stable Diffusion is a text-to-image latent diffusion model created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/), [LAION](https://laion.ai/) and [RunwayML](https://runwayml.com/). It's trained on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) database. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and runs on a GPU with at least 4GB VRAM.
 See the [model card](https://huggingface.co/CompVis/stable-diffusion) for more information.
 
-You need to accept the model license before downloading or using the Stable Diffusion weights. Please, visit the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.
+You need to accept the model license before downloading or using the Stable Diffusion weights. Please, visit the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license carefully and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section](https://huggingface.co/docs/hub/security-tokens) of the documentation.
 
 
 ### Text-to-Image generation with Stable Diffusion
 
+First let's install
+```bash
+pip install --upgrade diffusers transformers scipy
+```
+
+Run this command to log in with your HF Hub token if you haven't before (you can skip this step if you prefer to run the model locally, follow [this](#running-the-model-locally) instead)
+```bash
+huggingface-cli login
+```
+
 We recommend using the model in [half-precision (`fp16`)](https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/) as it gives almost always the same results as full
 precision while being roughly twice as fast and requiring half the amount of GPU RAM.
 
 ```python
-# make sure you're logged in with `huggingface-cli login`
 from diffusers import StableDiffusionPipeline
 
-pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_type=torch.float16, revision="fp16")
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_type=torch.float16, revision="fp16")
 pipe = pipe.to("cuda")
 
 prompt = "a photo of an astronaut riding a horse on mars"
 image = pipe(prompt).images[0]  
 ```
 
-**Note**: If you don't want to use the token, you can also simply download the model weights
-(after having [accepted the license](https://huggingface.co/CompVis/stable-diffusion-v1-4)) and pass
+#### Running the model locally
+If you don't want to login to Hugging Face, you can also simply download the model folder
+(after having [accepted the license](https://huggingface.co/runwayml/stable-diffusion-v1-5)) and pass
 the path to the local folder to the `StableDiffusionPipeline`.
 
 ```
 git lfs install
-git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
+git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
 ```
 
-Assuming the folder is stored locally under `./stable-diffusion-v1-4`, you can also run stable diffusion
+Assuming the folder is stored locally under `./stable-diffusion-v1-5`, you can also run stable diffusion
 without requiring an authentication token:
 
 ```python
-pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-4")
+pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
 pipe = pipe.to("cuda")
 
 prompt = "a photo of an astronaut riding a horse on mars"
@@ -114,7 +124,7 @@ The following snippet should result in less than 4GB VRAM.
 
 ```python
 pipe = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", 
+    "runwayml/stable-diffusion-v1-5", 
     revision="fp16", 
     torch_dtype=torch.float16,
 )
@@ -125,7 +135,7 @@ pipe.enable_attention_slicing()
 image = pipe(prompt).images[0]  
 ```
 
-If you wish to use a different scheduler, you can simply instantiate
+If you wish to use a different scheduler (e.g.: DDIM, LMS, PNDM/PLMS), you can instantiate
 it before the pipeline and pass it to `from_pretrained`.
     
 ```python
@@ -138,7 +148,7 @@ lms = LMSDiscreteScheduler(
 )
 
 pipe = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", 
+    "runwayml/stable-diffusion-v1-5", 
     revision="fp16", 
     torch_dtype=torch.float16,
     scheduler=lms,
@@ -158,7 +168,7 @@ please run the model in the default *full-precision* setting:
 # make sure you're logged in with `huggingface-cli login`
 from diffusers import StableDiffusionPipeline
 
-pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 
 # disable the following line if you run on CPU
 pipe = pipe.to("cuda")
@@ -169,6 +179,75 @@ image = pipe(prompt).images[0]
 image.save("astronaut_rides_horse.png")
 ```
 
+### JAX/Flax
+
+To use StableDiffusion on TPUs and GPUs for faster inference you can leverage JAX/Flax.
+
+Running the pipeline with default PNDMScheduler
+
+```python
+import jax
+import numpy as np
+from flax.jax_utils import replicate
+from flax.training.common_utils import shard
+
+from diffusers import FlaxStableDiffusionPipeline
+
+pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", revision="flax", dtype=jax.numpy.bfloat16
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+
+prng_seed = jax.random.PRNGKey(0)
+num_inference_steps = 50
+
+num_samples = jax.device_count()
+prompt = num_samples * [prompt]
+prompt_ids = pipeline.prepare_inputs(prompt)
+
+# shard inputs and rng
+params = replicate(params)
+prng_seed = jax.random.split(prng_seed, jax.device_count())
+prompt_ids = shard(prompt_ids)
+
+images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
+images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
+```
+
+**Note**:
+If you are limited by TPU memory, please make sure to load the `FlaxStableDiffusionPipeline` in `bfloat16` precision instead of the default `float32` precision as done above. You can do so by telling diffusers to load the weights from "bf16" branch.
+
+```python
+import jax
+import numpy as np
+from flax.jax_utils import replicate
+from flax.training.common_utils import shard
+
+from diffusers import FlaxStableDiffusionPipeline
+
+pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", revision="bf16", dtype=jax.numpy.bfloat16
+)
+
+prompt = "a photo of an astronaut riding a horse on mars"
+
+prng_seed = jax.random.PRNGKey(0)
+num_inference_steps = 50
+
+num_samples = jax.device_count()
+prompt = num_samples * [prompt]
+prompt_ids = pipeline.prepare_inputs(prompt)
+
+# shard inputs and rng
+params = replicate(params)
+prng_seed = jax.random.split(prng_seed, jax.device_count())
+prompt_ids = shard(prompt_ids)
+
+images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
+images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
+```
+
 ### Image-to-Image text-guided generation with Stable Diffusion
 
 The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
@@ -183,14 +262,14 @@ from diffusers import StableDiffusionImg2ImgPipeline
 
 # load the pipeline
 device = "cuda"
-model_id_or_path = "CompVis/stable-diffusion-v1-4"
+model_id_or_path = "runwayml/stable-diffusion-v1-5"
 pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
     model_id_or_path,
     revision="fp16", 
     torch_dtype=torch.float16,
 )
-# or download via git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
-# and pass `model_id_or_path="./stable-diffusion-v1-4"`.
+# or download via git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
+# and pass `model_id_or_path="./stable-diffusion-v1-5"`.
 pipe = pipe.to(device)
 
 # let's download an initial image
diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index 3082f99665f2..af711a02d9f3 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -67,8 +67,8 @@ Diffusion models often consist of multiple independently-trained models or other
 Each model has been trained independently on a different task and the scheduler can easily be swapped out and replaced with a different one. 
 During inference, we however want to be able to easily load all components and use them in inference - even if one component, *e.g.* CLIP's text encoder, originates from a different library, such as [Transformers](https://github.com/huggingface/transformers). To that end, all pipelines provide the following functionality:
 
-- [`from_pretrained` method](../diffusion_pipeline) that accepts a Hugging Face Hub repository id, *e.g.* [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) or a path to a local directory, *e.g.*
-"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [CompVis/stable-diffusion-v1-4/model_index.json](https://huggingface.co/CompVis/stable-diffusion-v1-4/blob/main/model_index.json), which defines all components that should be 
+- [`from_pretrained` method](../diffusion_pipeline) that accepts a Hugging Face Hub repository id, *e.g.* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) or a path to a local directory, *e.g.*
+"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [runwayml/stable-diffusion-v1-5/model_index.json](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), which defines all components that should be
 loaded into the pipelines. More specifically, for each model/component one needs to define the format `<name>: ["<library>", "<class name>"]`. `<name>` is the attribute name given to the loaded instance of `<class name>` which can be found in the library or pipeline folder called `"<library>"`.
 - [`save_pretrained`](../diffusion_pipeline) that accepts a local path, *e.g.* `./stable-diffusion` under which all models/components of the pipeline will be saved. For each component/model a folder is created inside the local path that is named after the given attribute name, *e.g.* `./stable_diffusion/unet`. 
 In addition, a `model_index.json` file is created at the root of the local path, *e.g.* `./stable_diffusion/model_index.json` so that the complete pipeline can again be instantiated 
@@ -100,7 +100,7 @@ logic including pre-processing, an unrolled diffusion loop, and post-processing
 # make sure you're logged in with `huggingface-cli login`
 from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
 
-pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 pipe = pipe.to("cuda")
 
 prompt = "a photo of an astronaut riding a horse on mars"
@@ -123,7 +123,7 @@ from diffusers import StableDiffusionImg2ImgPipeline
 # load the pipeline
 device = "cuda"
 pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
+    "runwayml/stable-diffusion-v1-5", revision="fp16", torch_dtype=torch.float16
 ).to(device)
 
 # let's download an initial image
diff --git a/docs/source/optimization/fp16.mdx b/docs/source/optimization/fp16.mdx
index b561aedbfe4a..d1dd87f7652f 100644
--- a/docs/source/optimization/fp16.mdx
+++ b/docs/source/optimization/fp16.mdx
@@ -56,7 +56,7 @@ If you use a CUDA GPU, you can take advantage of `torch.autocast` to perform inf
 from torch import autocast
 from diffusers import StableDiffusionPipeline
 
-pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 pipe = pipe.to("cuda")
 
 prompt = "a photo of an astronaut riding a horse on mars"
@@ -72,7 +72,7 @@ To save more GPU memory and get even more speed, you can load and run the model
 
 ```Python
 pipe = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
+    "runwayml/stable-diffusion-v1-5",
     revision="fp16",
     torch_dtype=torch.float16,
 )
@@ -97,7 +97,7 @@ import torch
 from diffusers import StableDiffusionPipeline
 
 pipe = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
+    "runwayml/stable-diffusion-v1-5",
     revision="fp16",
     torch_dtype=torch.float16,
 )
@@ -152,7 +152,7 @@ def generate_inputs():
 
 
 pipe = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
+    "runwayml/stable-diffusion-v1-5",
     revision="fp16",
     torch_dtype=torch.float16,
 ).to("cuda")
@@ -216,7 +216,7 @@ class UNet2DConditionOutput:
 
 
 pipe = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
+    "runwayml/stable-diffusion-v1-5",
     revision="fp16",
     torch_dtype=torch.float16,
 ).to("cuda")
diff --git a/docs/source/optimization/mps.mdx b/docs/source/optimization/mps.mdx
index ff9d614c870f..754adae5d9b2 100644
--- a/docs/source/optimization/mps.mdx
+++ b/docs/source/optimization/mps.mdx
@@ -31,7 +31,7 @@ We recommend to "prime" the pipeline using an additional one-time pass through i
 # make sure you're logged in with `huggingface-cli login`
 from diffusers import StableDiffusionPipeline
 
-pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 pipe = pipe.to("mps")
 
 prompt = "a photo of an astronaut riding a horse on mars"
diff --git a/docs/source/optimization/onnx.mdx b/docs/source/optimization/onnx.mdx
index 9bbc4f2077c2..e79efbde0742 100644
--- a/docs/source/optimization/onnx.mdx
+++ b/docs/source/optimization/onnx.mdx
@@ -28,7 +28,7 @@ The snippet below demonstrates how to use the ONNX runtime. You need to use `Sta
 from diffusers import StableDiffusionOnnxPipeline
 
 pipe = StableDiffusionOnnxPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
+    "runwayml/stable-diffusion-v1-5",
     revision="onnx",
     provider="CUDAExecutionProvider",
 )
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
index 9574ecac4a6a..a652695f00ba 100644
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -68,8 +68,7 @@ You can save the image by simply calling:
 
 More advanced models, like [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) require you to accept a [license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) before running the model.
 This is due to the improved image generation capabilities of the model and the potentially harmful content that could be produced with it.
-Long story short: Head over to your stable diffusion model of choice, *e.g.* [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4), read through the license and click-accept to get 
-access to the model. 
+Please, head over to your stable diffusion model of choice, *e.g.*  [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license carefully and tick the checkbox if you agree.
 You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
 Having "click-accepted" the license, you can save your token:
 
@@ -77,13 +76,13 @@ Having "click-accepted" the license, you can save your token:
 AUTH_TOKEN = "<please-fill-with-your-token>"
 ```
 
-You can then load [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) 
+You can then load [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)
 just like we did before only that now you need to pass your `AUTH_TOKEN`:
 
 ```python
 >>> from diffusers import DiffusionPipeline
 
->>> generator = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=AUTH_TOKEN)
+>>> generator = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_auth_token=AUTH_TOKEN)
 ```
 
 If you do not pass your authentication token you will see that the diffusion system will not be correctly 
@@ -95,15 +94,15 @@ the weights locally via:
 
 ```
 git lfs install
-git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
+git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
 ```
 
 and then load locally saved weights into the pipeline. This way, you do not need to pass an authentication
-token. Assuming that `"./stable-diffusion-v1-4"` is the local path to the cloned stable-diffusion-v1-4 repo,
+token. Assuming that `"./stable-diffusion-v1-5"` is the local path to the cloned stable-diffusion-v1-5 repo,
 you can also load the pipeline as follows:
 
 ```python
->>> generator = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-4")
+>>> generator = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
 ```
 
 Running the pipeline is then identical to the code above as it's the same model architecture.
@@ -125,7 +124,7 @@ you could use it as follows:
 >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
 
 >>> generator = StableDiffusionPipeline.from_pretrained(
-...     "CompVis/stable-diffusion-v1-4", scheduler=scheduler, use_auth_token=AUTH_TOKEN
+...     "runwayml/stable-diffusion-v1-5", scheduler=scheduler, use_auth_token=AUTH_TOKEN
 ... )
 ```
 
diff --git a/docs/source/training/text_inversion.mdx b/docs/source/training/text_inversion.mdx
index 13ea7c942b4e..d44cef301ed7 100644
--- a/docs/source/training/text_inversion.mdx
+++ b/docs/source/training/text_inversion.mdx
@@ -64,7 +64,7 @@ accelerate config
 
 ### Cat toy example
 
-You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree.
 
 You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
 
@@ -83,7 +83,7 @@ Now let's get our dataset.Download 3-4 images from [here](https://drive.google.c
 And launch the training using
 
 ```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
 export DATA_DIR="path-to-dir-containing-images"
 
 accelerate launch textual_inversion.py \
diff --git a/docs/source/using-diffusers/custom_pipelines.mdx b/docs/source/using-diffusers/custom_pipelines.mdx
index b52d405581b1..af973660a86e 100644
--- a/docs/source/using-diffusers/custom_pipelines.mdx
+++ b/docs/source/using-diffusers/custom_pipelines.mdx
@@ -58,7 +58,7 @@ feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id)
 clip_model = CLIPModel.from_pretrained(clip_model_id)
 
 pipeline = DiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
+    "runwayml/stable-diffusion-v1-5",
     custom_pipeline="clip_guided_stable_diffusion",
     clip_model=clip_model,
     feature_extractor=feature_extractor,
diff --git a/docs/source/using-diffusers/img2img.mdx b/docs/source/using-diffusers/img2img.mdx
index 62eaeea911c9..defefffe063e 100644
--- a/docs/source/using-diffusers/img2img.mdx
+++ b/docs/source/using-diffusers/img2img.mdx
@@ -25,7 +25,7 @@ from diffusers import StableDiffusionImg2ImgPipeline
 # load the pipeline
 device = "cuda"
 pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
+    "runwayml/stable-diffusion-v1-5", revision="fp16", torch_dtype=torch.float16
 ).to(device)
 
 # let's download an initial image
diff --git a/examples/community/README.md b/examples/community/README.md
index a5b2252d4408..7ecaffb33114 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -18,7 +18,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
 ```py
-pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder")
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="filename_in_the_community_folder")
 ```
 
 ## Example usages
@@ -41,7 +41,7 @@ clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K",
 
 
 guided_pipeline = DiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
+    "runwayml/stable-diffusion-v1-5",
     custom_pipeline="clip_guided_stable_diffusion",
     clip_model=clip_model,
     feature_extractor=feature_extractor,
@@ -141,7 +141,7 @@ def download_image(url):
     response = requests.get(url)
     return PIL.Image.open(BytesIO(response.content)).convert("RGB")
 
-pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="stable_diffusion_mega", torch_dtype=torch.float16, revision="fp16")
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_mega", torch_dtype=torch.float16, revision="fp16")
 pipe.to("cuda")
 pipe.enable_attention_slicing()
 
diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py
index b253370fa2a6..723951941576 100644
--- a/examples/community/stable_diffusion_mega.py
+++ b/examples/community/stable_diffusion_mega.py
@@ -46,7 +46,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionMegaSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
diff --git a/examples/test_examples.py b/examples/test_examples.py
index 15c8e05a5c12..eb86b18b75f0 100644
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -101,7 +101,7 @@ def test_textual_inversion(self):
         with tempfile.TemporaryDirectory() as tmpdir:
             test_args = f"""
                 examples/textual_inversion/textual_inversion.py
-                --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4
+                --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5
                 --train_data_dir docs/source/imgs
                 --learnable_property object
                 --placeholder_token <cat-toy>
diff --git a/examples/textual_inversion/README.md b/examples/textual_inversion/README.md
index 05d8ffb8c9f2..801c42c8a0f8 100644
--- a/examples/textual_inversion/README.md
+++ b/examples/textual_inversion/README.md
@@ -48,7 +48,7 @@ Now let's get our dataset.Download 3-4 images from [here](https://drive.google.c
 And launch the training using
 
 ```bash
-export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
 export DATA_DIR="path-to-dir-containing-images"
 
 accelerate launch textual_inversion.py \
diff --git a/src/diffusers/modeling_flax_utils.py b/src/diffusers/modeling_flax_utils.py
index e31a9b7b80d0..5ef100224907 100644
--- a/src/diffusers/modeling_flax_utils.py
+++ b/src/diffusers/modeling_flax_utils.py
@@ -105,14 +105,14 @@ def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None):
         >>> from diffusers import FlaxUNet2DConditionModel
 
         >>> # load model
-        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
         >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
         >>> params = model.to_bf16(params)
         >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
         >>> # then pass the mask as follows
         >>> from flax import traverse_util
 
-        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
         >>> flat_params = traverse_util.flatten_dict(params)
         >>> mask = {
         ...     path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
@@ -141,7 +141,7 @@ def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):
         >>> from diffusers import FlaxUNet2DConditionModel
 
         >>> # Download model and configuration from huggingface.co
-        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
         >>> # By default, the model params will be in fp32, to illustrate the use of this method,
         >>> # we'll first cast to fp16 and back to fp32
         >>> params = model.to_f16(params)
@@ -171,14 +171,14 @@ def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None):
         >>> from diffusers import FlaxUNet2DConditionModel
 
         >>> # load model
-        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
         >>> # By default, the model params will be in fp32, to cast these to float16
         >>> params = model.to_fp16(params)
         >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale)
         >>> # then pass the mask as follows
         >>> from flax import traverse_util
 
-        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
         >>> flat_params = traverse_util.flatten_dict(params)
         >>> mask = {
         ...     path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
@@ -216,7 +216,7 @@ def from_pretrained(
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                       Valid model ids are namespaced under a user or organization name, like
-                      `CompVis/stable-diffusion-v1-4`.
+                      `runwayml/stable-diffusion-v1-5`.
                     - A path to a *directory* containing model weights saved using [`~ModelMixin.save_pretrained`],
                       e.g., `./my_model_directory/`.
             dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
@@ -273,7 +273,7 @@ def from_pretrained(
         >>> from diffusers import FlaxUNet2DConditionModel
 
         >>> # Download model and configuration from huggingface.co and cache.
-        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4")
+        >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5")
         >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
         >>> model, params = FlaxUNet2DConditionModel.from_pretrained("./test/saved_model/")
         ```"""
diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
index d55338b50343..3c23693b40ef 100644
--- a/src/diffusers/pipeline_flax_utils.py
+++ b/src/diffusers/pipeline_flax_utils.py
@@ -244,7 +244,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         <Tip>
 
          It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
-         models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"CompVis/stable-diffusion-v1-4"`
+         models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"`
 
         </Tip>
 
@@ -266,13 +266,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         >>> # Download pipeline that requires an authorization token
         >>> # For more information on access tokens, please refer to this section
         >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens)
-        >>> pipeline = FlaxDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        >>> pipeline = FlaxDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 
         >>> # Download pipeline, but overwrite scheduler
         >>> from diffusers import LMSDiscreteScheduler
 
         >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
-        >>> pipeline = FlaxDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler)
+        >>> pipeline = FlaxDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=scheduler)
         ```
         """
         cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index d1e037499b65..894505654e47 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -317,7 +317,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         <Tip>
 
          It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
-         models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"CompVis/stable-diffusion-v1-4"`
+         models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"`
 
         </Tip>
 
@@ -339,13 +339,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         >>> # Download pipeline that requires an authorization token
         >>> # For more information on access tokens, please refer to this section
         >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens)
-        >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 
         >>> # Download pipeline, but overwrite scheduler
         >>> from diffusers import LMSDiscreteScheduler
 
         >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
-        >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler)
+        >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=scheduler)
         ```
         """
         cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
diff --git a/src/diffusers/pipelines/README.md b/src/diffusers/pipelines/README.md
index 90752d69f25f..66d3a8815d4e 100644
--- a/src/diffusers/pipelines/README.md
+++ b/src/diffusers/pipelines/README.md
@@ -55,8 +55,8 @@ Diffusion models often consist of multiple independently-trained models or other
 Each model has been trained independently on a different task and the scheduler can easily be swapped out and replaced with a different one. 
 During inference, we however want to be able to easily load all components and use them in inference - even if one component, *e.g.* CLIP's text encoder, originates from a different library, such as [Transformers](https://github.com/huggingface/transformers). To that end, all pipelines provide the following functionality:
 
-- [`from_pretrained` method](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L139) that accepts a Hugging Face Hub repository id, *e.g.* [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) or a path to a local directory, *e.g.*
-"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [CompVis/stable-diffusion-v1-4/model_index.json](https://huggingface.co/CompVis/stable-diffusion-v1-4/blob/main/model_index.json), which defines all components that should be 
+- [`from_pretrained` method](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L139) that accepts a Hugging Face Hub repository id, *e.g.* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) or a path to a local directory, *e.g.*
+"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [runwayml/stable-diffusion-v1-5/model_index.json](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), which defines all components that should be 
 loaded into the pipelines. More specifically, for each model/component one needs to define the format `<name>: ["<library>", "<class name>"]`. `<name>` is the attribute name given to the loaded instance of `<class name>` which can be found in the library or pipeline folder called `"<library>"`.
 - [`save_pretrained`](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L90) that accepts a local path, *e.g.* `./stable-diffusion` under which all models/components of the pipeline will be saved. For each component/model a folder is created inside the local path that is named after the given attribute name, *e.g.* `./stable_diffusion/unet`. 
 In addition, a `model_index.json` file is created at the root of the local path, *e.g.* `./stable_diffusion/model_index.json` so that the complete pipeline can again be instantiated 
@@ -88,7 +88,7 @@ logic including pre-processing, an unrolled diffusion loop, and post-processing
 # make sure you're logged in with `huggingface-cli login`
 from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
 
-pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 pipe = pipe.to("cuda")
 
 prompt = "a photo of an astronaut riding a horse on mars"
@@ -111,7 +111,7 @@ from diffusers import StableDiffusionImg2ImgPipeline
 # load the pipeline
 device = "cuda"
 pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4",
+    "runwayml/stable-diffusion-v1-5",
     revision="fp16", 
     torch_dtype=torch.float16,
 ).to(device)
diff --git a/src/diffusers/pipelines/stable_diffusion/README.md b/src/diffusers/pipelines/stable_diffusion/README.md
index 47c38acbdb35..9c92e7b1a354 100644
--- a/src/diffusers/pipelines/stable_diffusion/README.md
+++ b/src/diffusers/pipelines/stable_diffusion/README.md
@@ -13,7 +13,7 @@ The summary of the model is the following:
 - Stable Diffusion has the same architecture as [Latent Diffusion](https://arxiv.org/abs/2112.10752) but uses a frozen CLIP Text Encoder instead of training the text encoder jointly with the diffusion model.
 - An in-detail explanation of the Stable Diffusion model can be found under [Stable Diffusion with 🧨 Diffusers](https://huggingface.co/blog/stable_diffusion).
 - If you don't want to rely on the Hugging Face Hub and having to pass a authentication token, you can 
-download the weights with `git lfs install; git clone https://huggingface.co/CompVis/stable-diffusion-v1-4` and instead pass the local path to the cloned folder to `from_pretrained` as shown below.
+download the weights with `git lfs install; git clone https://huggingface.co/runwayml/stable-diffusion-v1-5` and instead pass the local path to the cloned folder to `from_pretrained` as shown below.
 - Stable Diffusion can work with a variety of different samplers as is shown below.
 
 ## Available Pipelines:
@@ -33,14 +33,14 @@ If you want to download the model weights using a single Python line, you need t
 ```python
 from diffusers import DiffusionPipeline
 
-pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 ```
 
-This however can make it difficult to build applications on top of `diffusers` as you will always have to pass the token around. A potential way to solve this issue is by downloading the weights to a local path `"./stable-diffusion-v1-4"`:
+This however can make it difficult to build applications on top of `diffusers` as you will always have to pass the token around. A potential way to solve this issue is by downloading the weights to a local path `"./stable-diffusion-v1-5"`:
 
 ```
 git lfs install
-git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
+git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
 ```
 
 and simply passing the local path to `from_pretrained`:
@@ -48,7 +48,7 @@ and simply passing the local path to `from_pretrained`:
 ```python
 from diffusers import StableDiffusionPipeline
 
-pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-4")
+pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
 ```
 
 ### Text-to-Image with default PLMS scheduler
@@ -57,7 +57,7 @@ pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-4")
 # make sure you're logged in with `huggingface-cli login`
 from diffusers import StableDiffusionPipeline
 
-pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 pipe = pipe.to("cuda")
 
 prompt = "a photo of an astronaut riding a horse on mars"
@@ -75,7 +75,7 @@ from diffusers import StableDiffusionPipeline, DDIMScheduler
 scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
 
 pipe = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", 
+    "runwayml/stable-diffusion-v1-5", 
     scheduler=scheduler,
 ).to("cuda")
 
@@ -98,7 +98,7 @@ lms = LMSDiscreteScheduler(
 )
 
 pipe = StableDiffusionPipeline.from_pretrained(
-    "CompVis/stable-diffusion-v1-4", 
+    "runwayml/stable-diffusion-v1-5", 
     scheduler=lms,
 ).to("cuda")
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index 18c008f8806b..e4f56d94dac8 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -45,7 +45,7 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
             [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], or [`FlaxPNDMScheduler`].
         safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index 89cc054a424e..ce3f3fbacbc7 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -50,7 +50,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 30f8d7fcc3b8..b45d968f66e3 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -63,7 +63,7 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 8ae51999a7b3..46def26e94cc 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -40,7 +40,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 799fd459bbd9..db8134834564 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -52,7 +52,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index abcc7fba6e8a..918a1241f996 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -59,7 +59,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 038178dd9a0a..71a2be62128c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -66,7 +66,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """

From 2fb8fafa4b761f6fc144cf75a6f6f0ea6af3a1c1 Mon Sep 17 00:00:00 2001
From: Nathan Lambert <nathan@huggingface.co>
Date: Mon, 24 Oct 2022 14:20:08 -0700
Subject: [PATCH 035/269] add community pipeline docs; add minimal text to some
 empty doc pages (#930)

* add community pipeline docs

* fix style in code snippets (lol)

* clean up loading docs

* add license to doc files

* fix some weird links
---
 docs/source/_toctree.yml                      |  12 +-
 docs/source/api/pipelines/ddim.mdx            |  12 +
 docs/source/api/pipelines/ddpm.mdx            |  12 +
 .../source/api/pipelines/latent_diffusion.mdx |  12 +
 .../api/pipelines/latent_diffusion_uncond.mdx |  12 +
 docs/source/api/pipelines/pndm.mdx            |  12 +
 docs/source/api/pipelines/score_sde_ve.mdx    |  12 +
 .../source/api/pipelines/stable_diffusion.mdx |  12 +
 .../api/pipelines/stochastic_karras_ve.mdx    |  12 +
 docs/source/using-diffusers/configuration.mdx |  19 +-
 .../using-diffusers/contribute_pipeline.mdx   | 169 +++++++++++
 docs/source/using-diffusers/custom.mdx        |  15 -
 .../custom_pipeline_examples.mdx              | 283 ++++++++++++++++++
 ...lines.mdx => custom_pipeline_overview.mdx} |   2 +-
 docs/source/using-diffusers/loading.mdx       |  23 +-
 src/diffusers/configuration_utils.py          |   2 +-
 16 files changed, 582 insertions(+), 39 deletions(-)
 create mode 100644 docs/source/using-diffusers/contribute_pipeline.mdx
 delete mode 100644 docs/source/using-diffusers/custom.mdx
 create mode 100644 docs/source/using-diffusers/custom_pipeline_examples.mdx
 rename docs/source/using-diffusers/{custom_pipelines.mdx => custom_pipeline_overview.mdx} (99%)

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 390b8ff042db..a0358c32e4ec 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -12,9 +12,9 @@
       title: "Loading Pipelines, Models, and Schedulers"
     - local: using-diffusers/configuration
       title: "Configuring Pipelines, Models, and Schedulers"
-    - local: using-diffusers/custom_pipelines
-      title: "Loading and Creating Custom Pipelines"
-    title: "Loading"
+    - local: using-diffusers/custom_pipeline_overview
+      title: "Loading and Adding Custom Pipelines"
+    title: "Loading & Hub"
   - sections:
     - local: using-diffusers/unconditional_image_generation
       title: "Unconditional Image Generation"
@@ -24,8 +24,10 @@
       title: "Text-Guided Image-to-Image"
     - local: using-diffusers/inpaint
       title: "Text-Guided Image-Inpainting"
-    - local: using-diffusers/custom
-      title: "Create a custom pipeline"
+    - local: using-diffusers/custom_pipeline_examples
+      title: "Community Pipelines"
+    - local: using-diffusers/contribute_pipeline
+      title: "How to contribute a Pipeline"
     title: "Pipelines for Inference"
   title: "Using Diffusers"
 - sections:
diff --git a/docs/source/api/pipelines/ddim.mdx b/docs/source/api/pipelines/ddim.mdx
index 41952c2c0dfe..2bb9def76ed8 100644
--- a/docs/source/api/pipelines/ddim.mdx
+++ b/docs/source/api/pipelines/ddim.mdx
@@ -1,3 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # DDIM
 
 ## Overview
diff --git a/docs/source/api/pipelines/ddpm.mdx b/docs/source/api/pipelines/ddpm.mdx
index b0e08f84ef74..c6d8a6f28660 100644
--- a/docs/source/api/pipelines/ddpm.mdx
+++ b/docs/source/api/pipelines/ddpm.mdx
@@ -1,3 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # DDPM
 
 ## Overview
diff --git a/docs/source/api/pipelines/latent_diffusion.mdx b/docs/source/api/pipelines/latent_diffusion.mdx
index 821ad4fea5b2..6d63cd5cbe5b 100644
--- a/docs/source/api/pipelines/latent_diffusion.mdx
+++ b/docs/source/api/pipelines/latent_diffusion.mdx
@@ -1,3 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # Latent Diffusion
 
 ## Overview
diff --git a/docs/source/api/pipelines/latent_diffusion_uncond.mdx b/docs/source/api/pipelines/latent_diffusion_uncond.mdx
index 4e12fa7f525c..0a5b20cd4a1c 100644
--- a/docs/source/api/pipelines/latent_diffusion_uncond.mdx
+++ b/docs/source/api/pipelines/latent_diffusion_uncond.mdx
@@ -1,3 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # Unconditional Latent Diffusion
 
 ## Overview
diff --git a/docs/source/api/pipelines/pndm.mdx b/docs/source/api/pipelines/pndm.mdx
index cbb7c5a929c7..89930f4d4f8f 100644
--- a/docs/source/api/pipelines/pndm.mdx
+++ b/docs/source/api/pipelines/pndm.mdx
@@ -1,3 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # PNDM
 
 ## Overview
diff --git a/docs/source/api/pipelines/score_sde_ve.mdx b/docs/source/api/pipelines/score_sde_ve.mdx
index 2e555914ac9b..3d6619c2591f 100644
--- a/docs/source/api/pipelines/score_sde_ve.mdx
+++ b/docs/source/api/pipelines/score_sde_ve.mdx
@@ -1,3 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # Score SDE VE
 
 ## Overview
diff --git a/docs/source/api/pipelines/stable_diffusion.mdx b/docs/source/api/pipelines/stable_diffusion.mdx
index 8fb0e470b479..46ffc3412e36 100644
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -1,3 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # Stable diffusion pipelines
 
 Stable Diffusion is a text-to-image _latent diffusion_ model created by the researchers and engineers from [CompVis](https://github.com/CompVis), [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/). It's trained on 512x512 images from a subset of the [LAION-5B](https://laion.ai/blog/laion-5b/) dataset. This model uses a frozen CLIP ViT-L/14 text encoder to condition the model on text prompts. With its 860M UNet and 123M text encoder, the model is relatively lightweight and can run on consumer GPUs.
diff --git a/docs/source/api/pipelines/stochastic_karras_ve.mdx b/docs/source/api/pipelines/stochastic_karras_ve.mdx
index d1ef8f7b3f6c..de762cbda002 100644
--- a/docs/source/api/pipelines/stochastic_karras_ve.mdx
+++ b/docs/source/api/pipelines/stochastic_karras_ve.mdx
@@ -1,3 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
 # Stochastic Karras VE
 
 ## Overview
diff --git a/docs/source/using-diffusers/configuration.mdx b/docs/source/using-diffusers/configuration.mdx
index a64cce0606be..36a2ad0d0394 100644
--- a/docs/source/using-diffusers/configuration.mdx
+++ b/docs/source/using-diffusers/configuration.mdx
@@ -12,21 +12,10 @@ specific language governing permissions and limitations under the License.
 
 
 
-# Quicktour
+# Configuration
 
-Start using Diffusers🧨 quickly!
-To start, use the [`DiffusionPipeline`] for quick inference and sample generations!
-
-```
-pip install diffusers
-```
-
-## Main classes
-
-### Models
-
-### Schedulers
-
-### Pipelines
+The handling of configurations in Diffusers is with the `ConfigMixin` class.
 
+[[autodoc]] ConfigMixin
 
+Under further construction 🚧, open a [PR](https://github.com/huggingface/diffusers/compare) if you want to contribute!
diff --git a/docs/source/using-diffusers/contribute_pipeline.mdx b/docs/source/using-diffusers/contribute_pipeline.mdx
new file mode 100644
index 000000000000..6122a996e1c1
--- /dev/null
+++ b/docs/source/using-diffusers/contribute_pipeline.mdx
@@ -0,0 +1,169 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# How to build a community pipeline
+
+*Note*: this page was built from the GitHub Issue on Community Pipelines [#841](https://github.com/huggingface/diffusers/issues/841).
+
+Let's make an example!
+Say you want to define a pipeline that just does a single forward pass to a U-Net and then calls a scheduler only once (Note, this doesn't make any sense from a scientific point of view, but only represents an example of how things work under the hood).
+
+Cool! So you open your favorite IDE and start creating your pipeline 💻.
+First, what model weights and configurations do we need?
+We have a U-Net and a scheduler, so our pipeline should take a U-Net and a scheduler as an argument.
+Also, as stated above, you'd like to be able to load weights and the scheduler config for Hub and share your code with others, so we'll inherit from `DiffusionPipeline`:
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+
+class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+    def __init__(self, unet, scheduler):
+        super().__init__()
+```
+
+Now, we must save the `unet` and `scheduler` in a config file so that you can save your pipeline with `save_pretrained`.
+Therefore, make sure you add every component that is save-able to the `register_modules` function:
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+
+class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+    def __init__(self, unet, scheduler):
+        super().__init__()
+
+        self.register_modules(unet=unet, scheduler=scheduler)
+```
+
+Cool, the init is done! 🔥 Now, let's go into the forward pass, which we recommend defining as `__call__` . Here you're given all the creative freedom there is. For our amazing "one-step" pipeline, we simply create a random image and call the unet once and the scheduler once:
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+
+class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+    def __init__(self, unet, scheduler):
+        super().__init__()
+
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    def __call__(self):
+        image = torch.randn(
+            (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+        )
+        timestep = 1
+
+        model_output = self.unet(image, timestep).sample
+        scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
+
+        return scheduler_output
+```
+
+Cool, that's it! 🚀 You can now run this pipeline by passing a `unet` and a `scheduler` to the init:
+
+```python
+from diffusers import DDPMScheduler, Unet2DModel
+
+scheduler = DDPMScheduler()
+unet = UNet2DModel()
+
+pipeline = UnetSchedulerOneForwardPipeline(unet=unet, scheduler=scheduler)
+
+output = pipeline()
+```
+
+But what's even better is that you can load pre-existing weights into the pipeline if they match exactly your pipeline structure. This is e.g. the case for [https://huggingface.co/google/ddpm-cifar10-32](https://huggingface.co/google/ddpm-cifar10-32) so that we can do the following:
+
+```python
+pipeline = UnetSchedulerOneForwardPipeline.from_pretrained("google/ddpm-cifar10-32")
+
+output = pipeline()
+```
+
+We want to share this amazing pipeline with the community, so we would open a PR request to add the following code under `one_step_unet.py` to [https://github.com/huggingface/diffusers/tree/main/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) .
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+
+class UnetSchedulerOneForwardPipeline(DiffusionPipeline):
+    def __init__(self, unet, scheduler):
+        super().__init__()
+
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    def __call__(self):
+        image = torch.randn(
+            (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+        )
+        timestep = 1
+
+        model_output = self.unet(image, timestep).sample
+        scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample
+
+        return scheduler_output
+```
+
+Our amazing pipeline got merged here: [#840](https://github.com/huggingface/diffusers/pull/840).
+Now everybody that has `diffusers >= 0.4.0` installed can use our pipeline magically 🪄 as follows:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet")
+pipe()
+```
+
+Another way to upload your custom_pipeline, besides sending a PR, is uploading the code that contains it to the Hugging Face Hub, [as exemplified here](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipelines#loading-custom-pipelines-from-the-hub).
+
+**Try it out now - it works!**
+
+In general, you will want to create much more sophisticated pipelines, so we recommend looking at existing pipelines here: [https://github.com/huggingface/diffusers/tree/main/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community).
+
+IMPORTANT:
+You can use whatever package you want in your community pipeline file - as long as the user has it installed, everything will work fine. Make sure you have one and only one pipeline class that inherits from `DiffusionPipeline` as this will be automatically detected.
+
+## How do community pipelines work?
+A community pipeline is a class that has to inherit from ['DiffusionPipeline']:
+and that has been added to `examples/community` [files](https://github.com/huggingface/diffusers/tree/main/examples/community).
+The community can load the pipeline code via the custom_pipeline argument from DiffusionPipeline. See docs [here](https://huggingface.co/docs/diffusers/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.custom_pipeline):
+
+This means:
+The model weights and configs of the pipeline should be loaded from the `pretrained_model_name_or_path` [argument](https://huggingface.co/docs/diffusers/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path):
+whereas the code that powers the community pipeline is defined in a file added in [`examples/community`](https://github.com/huggingface/diffusers/tree/main/examples/community).
+
+Now, it might very well be that only some of your pipeline components weights can be downloaded from an official repo.
+The other components should then be passed directly to init as is the case for the ClIP guidance notebook [here](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb#scrollTo=z9Kglma6hjki).
+
+The magic behind all of this is that we load the code directly from GitHub. You can check it out in more detail if you follow the functionality defined here:
+
+```python
+# 2. Load the pipeline class, if using custom module then load it from the hub
+# if we load from explicit class, let's use it
+if custom_pipeline is not None:
+    pipeline_class = get_class_from_dynamic_module(
+        custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline
+    )
+elif cls != DiffusionPipeline:
+    pipeline_class = cls
+else:
+    diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
+    pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
+```
+
+This is why a community pipeline merged to GitHub will be directly available to all `diffusers` packages.
+
diff --git a/docs/source/using-diffusers/custom.mdx b/docs/source/using-diffusers/custom.mdx
deleted file mode 100644
index 2d17adaff711..000000000000
--- a/docs/source/using-diffusers/custom.mdx
+++ /dev/null
@@ -1,15 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
--->
-
-# Custom Pipeline
-
-Under construction 🚧
diff --git a/docs/source/using-diffusers/custom_pipeline_examples.mdx b/docs/source/using-diffusers/custom_pipeline_examples.mdx
new file mode 100644
index 000000000000..b77e33be77d0
--- /dev/null
+++ b/docs/source/using-diffusers/custom_pipeline_examples.mdx
@@ -0,0 +1,283 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Custom Pipelines
+
+> **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).**
+
+**Community** examples consist of both inference and training examples that have been added by the community.
+Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
+If a community doesn't work as expected, please open an issue and ping the author on it.
+
+| Example                                | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                      | Colab                                                                                                                                                                                                              |                                                     Author |
+|:---------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------:|
+| CLIP Guided Stable Diffusion           | Doing CLIP guidance for text to image generation with Stable Diffusion                                                                                                                                                                                                                                                                                                                                                                                                                                   | [CLIP Guided Stable Diffusion](#clip-guided-stable-diffusion)     | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/CLIP_Guided_Stable_diffusion_with_diffusers.ipynb) |             [Suraj Patil](https://github.com/patil-suraj/) |
+| One Step U-Net (Dummy)                 | Example showcasing of how to use Community Pipelines (see https://github.com/huggingface/diffusers/issues/841)                                                                                                                                                                                                                                                                                                                                                                                           | [One Step U-Net](#one-step-unet)                                  | -                                                                                                                                                                                                                  | [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Stable Diffusion Interpolation         | Interpolate the latent space of Stable Diffusion between different prompts/seeds                                                                                                                                                                                                                                                                                                                                                                                                                         | [Stable Diffusion Interpolation](#stable-diffusion-interpolation) | -                                                                                                                                                                                                                  |                    [Nate Raw](https://github.com/nateraw/) |
+| Stable Diffusion Mega                  | **One** Stable Diffusion Pipeline with all functionalities of [Text2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py), [Image2Image](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) and [Inpainting](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | [Stable Diffusion Mega](#stable-diffusion-mega)                   | -                                                                                                                                                                                                                  | [Patrick von Platen](https://github.com/patrickvonplaten/) |
+| Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt.                                                                                                                                                                                                                                                                                                                                                                                                  | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [SkyTNT](https://github.com/SkyTNT) |
+| Speech to Image                        | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images                                                                                                                                                                                                                                                                                                                                                                                                            | [Speech to Image](#speech-to-image)                               | -                                                                                                                                                                                                                  | [Mikail Duzenli](https://github.com/MikailINTech)
+
+To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
+```py
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4", custom_pipeline="filename_in_the_community_folder"
+)
+```
+
+## Example usages
+
+### CLIP Guided Stable Diffusion
+
+CLIP guided stable diffusion can help to generate more realistic images
+by guiding stable diffusion at every denoising step with an additional CLIP model.
+
+The following code requires roughly 12GB of GPU RAM.
+
+```python
+from diffusers import DiffusionPipeline
+from transformers import CLIPFeatureExtractor, CLIPModel
+import torch
+
+
+feature_extractor = CLIPFeatureExtractor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
+clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16)
+
+
+guided_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="clip_guided_stable_diffusion",
+    clip_model=clip_model,
+    feature_extractor=feature_extractor,
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+guided_pipeline.enable_attention_slicing()
+guided_pipeline = guided_pipeline.to("cuda")
+
+prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece"
+
+generator = torch.Generator(device="cuda").manual_seed(0)
+images = []
+for i in range(4):
+    image = guided_pipeline(
+        prompt,
+        num_inference_steps=50,
+        guidance_scale=7.5,
+        clip_guidance_scale=100,
+        num_cutouts=4,
+        use_cutouts=False,
+        generator=generator,
+    ).images[0]
+    images.append(image)
+
+# save images locally
+for i, img in enumerate(images):
+    img.save(f"./clip_guided_sd/image_{i}.png")
+```
+
+The `images` list contains a list of PIL images that can be saved locally or displayed directly in a google colab.
+Generated images tend to be of higher qualtiy than natively using stable diffusion. E.g. the above script generates the following images:
+
+![clip_guidance](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/clip_guidance/merged_clip_guidance.jpg).
+
+### One Step Unet
+
+The dummy "one-step-unet" can be run as follows:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="one_step_unet")
+pipe()
+```
+
+**Note**: This community pipeline is not useful as a feature, but rather just serves as an example of how community pipelines can be added (see https://github.com/huggingface/diffusers/issues/841).
+
+### Stable Diffusion Interpolation
+
+The following code can be run on a GPU of at least 8GB VRAM and should take approximately 5 minutes.
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision="fp16",
+    torch_dtype=torch.float16,
+    safety_checker=None,  # Very important for videos...lots of false positives while interpolating
+    custom_pipeline="interpolate_stable_diffusion",
+).to("cuda")
+pipe.enable_attention_slicing()
+
+frame_filepaths = pipe.walk(
+    prompts=["a dog", "a cat", "a horse"],
+    seeds=[42, 1337, 1234],
+    num_interpolation_steps=16,
+    output_dir="./dreams",
+    batch_size=4,
+    height=512,
+    width=512,
+    guidance_scale=8.5,
+    num_inference_steps=50,
+)
+```
+
+The output of the `walk(...)` function returns a list of images saved under the folder as defined in `output_dir`. You can use these images to create videos of stable diffusion.
+
+> **Please have a look at https://github.com/nateraw/stable-diffusion-videos for more in-detail information on how to create videos using stable diffusion as well as more feature-complete functionality.**
+
+### Stable Diffusion Mega
+
+The Stable Diffusion Mega Pipeline lets you use the main use cases of the stable diffusion pipeline in a single class.
+
+```python
+#!/usr/bin/env python3
+from diffusers import DiffusionPipeline
+import PIL
+import requests
+from io import BytesIO
+import torch
+
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="stable_diffusion_mega",
+    torch_dtype=torch.float16,
+    revision="fp16",
+)
+pipe.to("cuda")
+pipe.enable_attention_slicing()
+
+
+### Text-to-Image
+
+images = pipe.text2img("An astronaut riding a horse").images
+
+### Image-to-Image
+
+init_image = download_image(
+    "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+)
+
+prompt = "A fantasy landscape, trending on artstation"
+
+images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+
+### Inpainting
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+prompt = "a cat sitting on a bench"
+images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+```
+
+As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
+
+### Long Prompt Weighting Stable Diffusion
+
+The Pipeline lets you input prompt without 77 token length limit. And you can increase words weighting by using "()" or decrease words weighting by using "[]"
+The Pipeline also lets you use the main use cases of the stable diffusion pipeline in a single class.
+
+#### pytorch
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    "hakurei/waifu-diffusion", custom_pipeline="lpw_stable_diffusion", revision="fp16", torch_dtype=torch.float16
+)
+pipe = pipe.to("cuda")
+
+prompt = "best_quality (1girl:1.3) bow bride brown_hair closed_mouth frilled_bow frilled_hair_tubes frills (full_body:1.3) fox_ear hair_bow hair_tubes happy hood japanese_clothes kimono long_sleeves red_bow smile solo tabi uchikake white_kimono wide_sleeves cherry_blossoms"
+neg_prompt = "lowres, bad_anatomy, error_body, error_hair, error_arm, error_hands, bad_hands, error_fingers, bad_fingers, missing_fingers, error_legs, bad_legs, multiple_legs, missing_legs, error_lighting, error_shadow, error_reflection, text, error, extra_digit, fewer_digits, cropped, worst_quality, low_quality, normal_quality, jpeg_artifacts, signature, watermark, username, blurry"
+
+pipe.text2img(prompt, negative_prompt=neg_prompt, width=512, height=512, max_embeddings_multiples=3).images[0]
+```
+
+#### onnxruntime
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="lpw_stable_diffusion_onnx",
+    revision="onnx",
+    provider="CUDAExecutionProvider",
+)
+
+prompt = "a photo of an astronaut riding a horse on mars, best quality"
+neg_prompt = "lowres, bad anatomy, error body, error hair, error arm, error hands, bad hands, error fingers, bad fingers, missing fingers, error legs, bad legs, multiple legs, missing legs, error lighting, error shadow, error reflection, text, error, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry"
+
+pipe.text2img(prompt, negative_prompt=neg_prompt, width=512, height=512, max_embeddings_multiples=3).images[0]
+```
+
+if you see `Token indices sequence length is longer than the specified maximum sequence length for this model ( *** > 77 ) . Running this sequence through the model will result in indexing errors`. Do not worry, it is normal.
+
+### Speech to Image
+
+The following code can generate an image from an audio sample using pre-trained OpenAI whisper-small and Stable Diffusion.
+
+```Python
+import torch
+
+import matplotlib.pyplot as plt
+from datasets import load_dataset
+from diffusers import DiffusionPipeline
+from transformers import (
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+)
+
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+audio_sample = ds[3]
+
+text = audio_sample["text"].lower()
+speech_data = audio_sample["audio"]["array"]
+
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+
+diffuser_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="speech_to_image_diffusion",
+    speech_model=model,
+    speech_processor=processor,
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+
+diffuser_pipeline.enable_attention_slicing()
+diffuser_pipeline = diffuser_pipeline.to(device)
+
+output = diffuser_pipeline(speech_data)
+plt.imshow(output.images[0])
+```
+This example produces the following image:
+
+![image](https://user-images.githubusercontent.com/45072645/196901736-77d9c6fc-63ee-4072-90b0-dc8b903d63e3.png)
\ No newline at end of file
diff --git a/docs/source/using-diffusers/custom_pipelines.mdx b/docs/source/using-diffusers/custom_pipeline_overview.mdx
similarity index 99%
rename from docs/source/using-diffusers/custom_pipelines.mdx
rename to docs/source/using-diffusers/custom_pipeline_overview.mdx
index af973660a86e..b1e9c0ce6178 100644
--- a/docs/source/using-diffusers/custom_pipelines.mdx
+++ b/docs/source/using-diffusers/custom_pipeline_overview.mdx
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
-# Custom Pipelines
+# Loading and Saving Custom Pipelines
 
 Diffusers allows you to conveniently load any custom pipeline from the Hugging Face Hub as well as any [official community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community) 
 via the [`DiffusionPipeline`] class.
diff --git a/docs/source/using-diffusers/loading.mdx b/docs/source/using-diffusers/loading.mdx
index 44b514bbb299..35f53b566442 100644
--- a/docs/source/using-diffusers/loading.mdx
+++ b/docs/source/using-diffusers/loading.mdx
@@ -10,6 +10,25 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
-# Loading 
+# Loading
 
-Under construction 🚧
+The core functionality for saving and loading systems in `Diffusers` is the HuggingFace Hub.
+
+[[autodoc]] modeling_utils.ModelMixin
+    - from_pretrained
+    - save_pretrained
+
+[[autodoc]] pipeline_utils.DiffusionPipeline
+    - from_pretrained
+    - save_pretrained
+
+[[autodoc]] modeling_flax_utils.FlaxModelMixin
+    - from_pretrained
+    - save_pretrained
+
+[[autodoc]] pipeline_flax_utils.FlaxDiffusionPipeline
+    - from_pretrained
+    - save_pretrained
+
+
+Under further construction 🚧, open a [PR](https://github.com/huggingface/diffusers/compare) if you want to contribute!
diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index 6f866c6b5fff..6df103823086 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" ConfigMixinuration base class and utilities."""
+""" ConfigMixin base class and utilities."""
 import dataclasses
 import functools
 import inspect

From 82044153df266fbe49a261de56e76d85dca26304 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Tue, 25 Oct 2022 13:05:44 +0200
Subject: [PATCH 036/269] Fix typo: `torch_type` -> `torch_dtype` (#972)

Fix typo: torch_type -> torch_dtype
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1dc1d7838a5d..a5f47dfcb032 100644
--- a/README.md
+++ b/README.md
@@ -90,7 +90,7 @@ precision while being roughly twice as fast and requiring half the amount of GPU
 ```python
 from diffusers import StableDiffusionPipeline
 
-pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_type=torch.float16, revision="fp16")
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, revision="fp16")
 pipe = pipe.to("cuda")
 
 prompt = "a photo of an astronaut riding a horse on mars"

From 6e099e2c8ce4c4f5c7318e970a8c093dc5c7046e Mon Sep 17 00:00:00 2001
From: Tanishq Abraham <37097934+tmabraham@users.noreply.github.com>
Date: Tue, 25 Oct 2022 04:08:56 -0700
Subject: [PATCH 037/269] add num_inference_steps arg to DDPM (#935)

---
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index 5842a6d297bb..85db691dc657 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -42,6 +42,7 @@ def __call__(
         self,
         batch_size: int = 1,
         generator: Optional[torch.Generator] = None,
+        num_inference_steps: int = 1000,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         **kwargs,
@@ -53,6 +54,9 @@ def __call__(
             generator (`torch.Generator`, *optional*):
                 A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                 deterministic.
+            num_inference_steps (`int`, *optional*, defaults to 1000):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -73,7 +77,7 @@ def __call__(
         image = image.to(self.device)
 
         # set step values
-        self.scheduler.set_timesteps(1000)
+        self.scheduler.set_timesteps(num_inference_steps)
 
         for t in self.progress_bar(self.scheduler.timesteps):
             # 1. predict noise model_output

From 38ae5a25da17015729d185db6e58b7baa0e095d0 Mon Sep 17 00:00:00 2001
From: MarkRich <mark.rich388@gmail.com>
Date: Tue, 25 Oct 2022 04:30:27 -0700
Subject: [PATCH 038/269] Add Composable diffusion to community pipeline
 examples  (#951)

* Initial composable diffusion pipeline

* add composable stable diffusion to readme table

* Update examples/community/README.md

* Apply suggestions from code review

* Update examples/community/README.md

* Update examples/community/README.md

* Update examples/community/README.md

* up

* Update examples/community/README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 examples/community/README.md                  |  49 +++
 .../community/composable_stable_diffusion.py  | 329 ++++++++++++++++++
 2 files changed, 378 insertions(+)
 create mode 100644 examples/community/composable_stable_diffusion.py

diff --git a/examples/community/README.md b/examples/community/README.md
index 7ecaffb33114..8f70e004e5fd 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -15,6 +15,8 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt.                                                                                                                                                                                                                                                                                                                                                                                                  | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [SkyTNT](https://github.com/SkyTNT) |
 | Speech to Image                        | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images                                                                                                                                                                                                                                                                                                                                                                                                            | [Speech to Image](#speech-to-image)                               | -                                                                                                                                                                                                                  | [Mikail Duzenli](https://github.com/MikailINTech)
 | Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values                                                                                                                                                                                                                                                                                                     | [Wildcard Stable Diffusion](#wildcard-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Shyam Sudhakaran](https://github.com/shyamsn97) |
+| Composable Stable Diffusion| Stable Diffusion Pipeline that supports prompts that contain "&#124;" in prompts (as an AND condition) and weights (separated by "&#124;" as well) to positively / negatively weight prompts.                                                                                                                                                                                                                                                                                                     | [Composable Stable Diffusion](#composable-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
+
 
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
 ```py
@@ -322,3 +324,50 @@ out = pipe(
     wildcard_files=["object.txt", "animal.txt"],
     num_prompt_samples=1
 )
+```
+
+
+### Composable Stable diffusion 
+
+```python
+import torch as th
+import numpy as np
+import torchvision.utils as tvu
+from diffusers import DiffusionPipeline
+
+has_cuda = th.cuda.is_available()
+device = th.device('cpu' if not has_cuda else 'cuda')
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    use_auth_token=True,
+    custom_pipeline="composable_stable_diffusion",
+).to(device)
+
+
+def dummy(images, **kwargs):
+    return images, False
+
+pipe.safety_checker = dummy
+
+images = []
+generator = th.Generator("cuda").manual_seed(0)
+
+seed = 0
+prompt = "a forest | a camel"
+weights = " 1 | 1"  # Equal weight to each prompt. Cna be negative
+
+images = []
+for i in range(4):
+    res = pipe(
+        prompt,
+        guidance_scale=7.5,
+        num_inference_steps=50,
+        weights=weights,
+        generator=generator)
+    image = res.images[0]
+    images.append(image)
+
+for i, img in enumerate(images):
+    img.save(f"./composable_diffusion/image_{i}.png")
+```
diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py
new file mode 100644
index 000000000000..308c18287e02
--- /dev/null
+++ b/examples/community/composable_stable_diffusion.py
@@ -0,0 +1,329 @@
+"""
+    modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+"""
+import inspect
+import warnings
+from typing import List, Optional, Union
+
+import torch
+
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+class ComposableStableDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offsensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        weights: Optional[str] = "",
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if "torch_device" in kwargs:
+            device = kwargs.pop("torch_device")
+            warnings.warn(
+                "`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0."
+                " Consider using `pipe.to(torch_device)` instead."
+            )
+
+            # Set device as before (to be removed in 0.3.0)
+            if device is None:
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+            self.to(device)
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if "|" in prompt:
+            prompt = [x.strip() for x in prompt.split("|")]
+            print(f"composing {prompt}...")
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+
+        if not weights:
+            # specify weights for prompts (excluding the unconditional score)
+            print("using equal weights for all prompts...")
+            pos_weights = torch.tensor(
+                [1 / (text_embeddings.shape[0] - 1)] * (text_embeddings.shape[0] - 1), device=self.device
+            ).reshape(-1, 1, 1, 1)
+            neg_weights = torch.tensor([1.0], device=self.device).reshape(-1, 1, 1, 1)
+            mask = torch.tensor([False] + [True] * pos_weights.shape[0], dtype=torch.bool)
+        else:
+            # set prompt weight for each
+            num_prompts = len(prompt) if isinstance(prompt, list) else 1
+            weights = [float(w.strip()) for w in weights.split("|")]
+            if len(weights) < num_prompts:
+                weights.append(1.0)
+            weights = torch.tensor(weights, device=self.device)
+            assert len(weights) == text_embeddings.shape[0], "weights specified are not equal to the number of prompts"
+            pos_weights = []
+            neg_weights = []
+            mask = []  # first one is unconditional score
+            for w in weights:
+                if w > 0:
+                    pos_weights.append(w)
+                    mask.append(True)
+                else:
+                    neg_weights.append(abs(w))
+                    mask.append(False)
+            # normalize the weights
+            pos_weights = torch.tensor(pos_weights, device=self.device).reshape(-1, 1, 1, 1)
+            pos_weights = pos_weights / pos_weights.sum()
+            neg_weights = torch.tensor(neg_weights, device=self.device).reshape(-1, 1, 1, 1)
+            neg_weights = neg_weights / neg_weights.sum()
+            mask = torch.tensor(mask, device=self.device, dtype=torch.bool)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            max_length = text_input.input_ids.shape[-1]
+
+            if torch.all(mask):
+                # no negative prompts, so we use empty string as the negative prompt
+                uncond_input = self.tokenizer(
+                    [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
+                )
+                uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+                # For classifier free guidance, we need to do two forward passes.
+                # Here we concatenate the unconditional and text embeddings into a single batch
+                # to avoid doing two forward passes
+                text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+                # update negative weights
+                neg_weights = torch.tensor([1.0], device=self.device)
+                mask = torch.tensor([False] + mask.detach().tolist(), device=self.device, dtype=torch.bool)
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_device = "cpu" if self.device.type == "mps" else self.device
+        latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
+        if latents is None:
+            latents = torch.randn(
+                latents_shape,
+                generator=generator,
+                device=latents_device,
+            )
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+        latents = latents.to(self.device)
+
+        # set timesteps
+        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        # if we use LMSDiscreteScheduler, let's make sure latents are mulitplied by sigmas
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            latents = latents * self.scheduler.sigmas[0]
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = (
+                torch.cat([latents] * text_embeddings.shape[0]) if do_classifier_free_guidance else latents
+            )
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                sigma = self.scheduler.sigmas[i]
+                # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
+                latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
+
+            # reduce memory by predicting each score sequentially
+            noise_preds = []
+            # predict the noise residual
+            for latent_in, text_embedding_in in zip(
+                torch.chunk(latent_model_input, chunks=latent_model_input.shape[0], dim=0),
+                torch.chunk(text_embeddings, chunks=text_embeddings.shape[0], dim=0),
+            ):
+                noise_preds.append(self.unet(latent_in, t, encoder_hidden_states=text_embedding_in).sample)
+            noise_preds = torch.cat(noise_preds, dim=0)
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond = (noise_preds[~mask] * neg_weights).sum(dim=0, keepdims=True)
+                noise_pred_text = (noise_preds[mask] * pos_weights).sum(dim=0, keepdims=True)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            if isinstance(self.scheduler, LMSDiscreteScheduler):
+                latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs).prev_sample
+            else:
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        # run safety checker
+        safety_cheker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
+        image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_cheker_input.pixel_values)
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

From 240abddfbce5e006997f2b53c1ff248e17b3b580 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Tue, 25 Oct 2022 13:43:24 +0200
Subject: [PATCH 039/269] [Flax] added broadcast_to_shape_from_left helper and
 Scheduler tests (#864)

* added broadcast_to_shape_from_left helper

* initial tests

* fixed pndm tests

* shape required for pndm

* added require_flax

* fix style

* fix more imports

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 src/diffusers/schedulers/__init__.py          |   2 +-
 .../schedulers/scheduling_ddim_flax.py        |  15 +-
 .../schedulers/scheduling_ddpm_flax.py        |  23 +-
 .../schedulers/scheduling_karras_ve_flax.py   |  11 +-
 .../scheduling_lms_discrete_flax.py           |  15 +-
 .../schedulers/scheduling_pndm_flax.py        |  10 +-
 .../schedulers/scheduling_sde_ve_flax.py      |  26 +-
 .../schedulers/scheduling_utils_flax.py       |   6 +
 tests/test_scheduler_flax.py                  | 863 ++++++++++++++++++
 9 files changed, 931 insertions(+), 40 deletions(-)
 create mode 100644 tests/test_scheduler_flax.py

diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index a906c39eb24c..91026b7d487f 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -34,7 +34,7 @@
     from .scheduling_lms_discrete_flax import FlaxLMSDiscreteScheduler
     from .scheduling_pndm_flax import FlaxPNDMScheduler
     from .scheduling_sde_ve_flax import FlaxScoreSdeVeScheduler
-    from .scheduling_utils_flax import FlaxSchedulerMixin
+    from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
 else:
     from ..utils.dummy_flax_objects import *  # noqa F403
 
diff --git a/src/diffusers/schedulers/scheduling_ddim_flax.py b/src/diffusers/schedulers/scheduling_ddim_flax.py
index 7726ef2eda0d..590e3aac2e49 100644
--- a/src/diffusers/schedulers/scheduling_ddim_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddim_flax.py
@@ -23,7 +23,7 @@
 import jax.numpy as jnp
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
 
 
 def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> jnp.ndarray:
@@ -173,7 +173,9 @@ def _get_variance(self, timestep, prev_timestep, alphas_cumprod):
 
         return variance
 
-    def set_timesteps(self, state: DDIMSchedulerState, num_inference_steps: int, shape: Tuple) -> DDIMSchedulerState:
+    def set_timesteps(
+        self, state: DDIMSchedulerState, num_inference_steps: int, shape: Tuple = ()
+    ) -> DDIMSchedulerState:
         """
         Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
 
@@ -211,9 +213,6 @@ def step(
             timestep (`int`): current discrete timestep in the diffusion chain.
             sample (`jnp.ndarray`):
                 current instance of sample being created by diffusion process.
-            key (`random.KeyArray`): a PRNG key.
-            eta (`float`): weight of noise for added noise in diffusion step.
-            use_clipped_model_output (`bool`): TODO
             return_dict (`bool`): option for returning tuple rather than FlaxDDIMSchedulerOutput class
 
         Returns:
@@ -279,13 +278,11 @@ def add_noise(
     ) -> jnp.ndarray:
         sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
         sqrt_alpha_prod = sqrt_alpha_prod.flatten()
-        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
-            sqrt_alpha_prod = sqrt_alpha_prod[:, None]
+        sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
 
         sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.0
         sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
-        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
-            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod[:, None]
+        sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
 
         noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
         return noisy_samples
diff --git a/src/diffusers/schedulers/scheduling_ddpm_flax.py b/src/diffusers/schedulers/scheduling_ddpm_flax.py
index 7b3265611101..7220a0145471 100644
--- a/src/diffusers/schedulers/scheduling_ddpm_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddpm_flax.py
@@ -23,7 +23,7 @@
 from jax import random
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
 
 
 def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> jnp.ndarray:
@@ -101,6 +101,10 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
 
     """
 
+    @property
+    def has_state(self):
+        return True
+
     @register_to_config
     def __init__(
         self,
@@ -129,11 +133,12 @@ def __init__(
         self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
         self.one = jnp.array(1.0)
 
-        self.state = DDPMSchedulerState.create(num_train_timesteps=num_train_timesteps)
-
-        self.variance_type = variance_type
+    def create_state(self):
+        return DDPMSchedulerState.create(num_train_timesteps=self.config.num_train_timesteps)
 
-    def set_timesteps(self, state: DDPMSchedulerState, num_inference_steps: int, shape: Tuple) -> DDPMSchedulerState:
+    def set_timesteps(
+        self, state: DDPMSchedulerState, num_inference_steps: int, shape: Tuple = ()
+    ) -> DDPMSchedulerState:
         """
         Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
 
@@ -214,7 +219,7 @@ def step(
         """
         t = timestep
 
-        if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
+        if model_output.shape[1] == sample.shape[1] * 2 and self.config.variance_type in ["learned", "learned_range"]:
             model_output, predicted_variance = jnp.split(model_output, sample.shape[1], axis=1)
         else:
             predicted_variance = None
@@ -267,13 +272,11 @@ def add_noise(
     ) -> jnp.ndarray:
         sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
         sqrt_alpha_prod = sqrt_alpha_prod.flatten()
-        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
-            sqrt_alpha_prod = sqrt_alpha_prod[..., None]
+        sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
 
         sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
         sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
-        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
-            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod[..., None]
+        sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
 
         noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
         return noisy_samples
diff --git a/src/diffusers/schedulers/scheduling_karras_ve_flax.py b/src/diffusers/schedulers/scheduling_karras_ve_flax.py
index caf27aa4c226..78ab007954aa 100644
--- a/src/diffusers/schedulers/scheduling_karras_ve_flax.py
+++ b/src/diffusers/schedulers/scheduling_karras_ve_flax.py
@@ -87,6 +87,10 @@ class FlaxKarrasVeScheduler(FlaxSchedulerMixin, ConfigMixin):
             A reasonable range is [0.2, 80].
     """
 
+    @property
+    def has_state(self):
+        return True
+
     @register_to_config
     def __init__(
         self,
@@ -97,10 +101,13 @@ def __init__(
         s_min: float = 0.05,
         s_max: float = 50,
     ):
-        self.state = KarrasVeSchedulerState.create()
+        pass
+
+    def create_state(self):
+        return KarrasVeSchedulerState.create()
 
     def set_timesteps(
-        self, state: KarrasVeSchedulerState, num_inference_steps: int, shape: Tuple
+        self, state: KarrasVeSchedulerState, num_inference_steps: int, shape: Tuple = ()
     ) -> KarrasVeSchedulerState:
         """
         Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference.
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete_flax.py b/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
index cd71e1960e8c..20982d38aa40 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
@@ -20,7 +20,7 @@
 from scipy import integrate
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
 
 
 @flax.struct.dataclass
@@ -63,6 +63,10 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
             option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
     """
 
+    @property
+    def has_state(self):
+        return True
+
     @register_to_config
     def __init__(
         self,
@@ -85,8 +89,10 @@ def __init__(
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
 
+    def create_state(self):
         self.state = LMSDiscreteSchedulerState.create(
-            num_train_timesteps=num_train_timesteps, sigmas=((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5
+            num_train_timesteps=self.config.num_train_timesteps,
+            sigmas=((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5,
         )
 
     def get_lms_coefficient(self, state, order, t, current_order):
@@ -112,7 +118,7 @@ def lms_derivative(tau):
         return integrated_coeff
 
     def set_timesteps(
-        self, state: LMSDiscreteSchedulerState, num_inference_steps: int, shape: Tuple
+        self, state: LMSDiscreteSchedulerState, num_inference_steps: int, shape: Tuple = ()
     ) -> LMSDiscreteSchedulerState:
         """
         Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
@@ -199,8 +205,7 @@ def add_noise(
         timesteps: jnp.ndarray,
     ) -> jnp.ndarray:
         sigma = state.sigmas[timesteps].flatten()
-        while len(sigma.shape) < len(noise.shape):
-            sigma = sigma[..., None]
+        sigma = broadcast_to_shape_from_left(sigma, noise.shape)
 
         noisy_samples = original_samples + noise * sigma
 
diff --git a/src/diffusers/schedulers/scheduling_pndm_flax.py b/src/diffusers/schedulers/scheduling_pndm_flax.py
index 2e587d5e2121..357ecfe046c1 100644
--- a/src/diffusers/schedulers/scheduling_pndm_flax.py
+++ b/src/diffusers/schedulers/scheduling_pndm_flax.py
@@ -23,7 +23,7 @@
 import jax.numpy as jnp
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
 
 
 def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999) -> jnp.ndarray:
@@ -168,6 +168,8 @@ def set_timesteps(self, state: PNDMSchedulerState, num_inference_steps: int, sha
                 the `FlaxPNDMScheduler` state data class instance.
             num_inference_steps (`int`):
                 the number of diffusion steps used when generating samples with a pre-trained model.
+            shape (`Tuple`):
+                the shape of the samples to be generated.
         """
         offset = self.config.steps_offset
 
@@ -509,13 +511,11 @@ def add_noise(
     ) -> jnp.ndarray:
         sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
         sqrt_alpha_prod = sqrt_alpha_prod.flatten()
-        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
-            sqrt_alpha_prod = sqrt_alpha_prod[..., None]
+        sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
 
         sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
         sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
-        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
-            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod[..., None]
+        sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
 
         noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
         return noisy_samples
diff --git a/src/diffusers/schedulers/scheduling_sde_ve_flax.py b/src/diffusers/schedulers/scheduling_sde_ve_flax.py
index c4d802f83f94..d3eadede61b5 100644
--- a/src/diffusers/schedulers/scheduling_sde_ve_flax.py
+++ b/src/diffusers/schedulers/scheduling_sde_ve_flax.py
@@ -22,7 +22,7 @@
 from jax import random
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
 
 
 @flax.struct.dataclass
@@ -80,6 +80,10 @@ class FlaxScoreSdeVeScheduler(FlaxSchedulerMixin, ConfigMixin):
         correct_steps (`int`): number of correction steps performed on a produced sample.
     """
 
+    @property
+    def has_state(self):
+        return True
+
     @register_to_config
     def __init__(
         self,
@@ -90,12 +94,20 @@ def __init__(
         sampling_eps: float = 1e-5,
         correct_steps: int = 1,
     ):
-        state = ScoreSdeVeSchedulerState.create()
+        pass
 
-        self.state = self.set_sigmas(state, num_train_timesteps, sigma_min, sigma_max, sampling_eps)
+    def create_state(self):
+        state = ScoreSdeVeSchedulerState.create()
+        return self.set_sigmas(
+            state,
+            self.config.num_train_timesteps,
+            self.config.sigma_min,
+            self.config.sigma_max,
+            self.config.sampling_eps,
+        )
 
     def set_timesteps(
-        self, state: ScoreSdeVeSchedulerState, num_inference_steps: int, shape: Tuple, sampling_eps: float = None
+        self, state: ScoreSdeVeSchedulerState, num_inference_steps: int, shape: Tuple = (), sampling_eps: float = None
     ) -> ScoreSdeVeSchedulerState:
         """
         Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference.
@@ -193,8 +205,7 @@ def step_pred(
         # equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x)
         # also equation 47 shows the analog from SDE models to ancestral sampling methods
         diffusion = diffusion.flatten()
-        while len(diffusion.shape) < len(sample.shape):
-            diffusion = diffusion[:, None]
+        diffusion = broadcast_to_shape_from_left(diffusion, sample.shape)
         drift = drift - diffusion**2 * model_output
 
         #  equation 6: sample noise for the diffusion term of
@@ -252,8 +263,7 @@ def step_correct(
 
         # compute corrected sample: model_output term and noise term
         step_size = step_size.flatten()
-        while len(step_size.shape) < len(sample.shape):
-            step_size = step_size[:, None]
+        step_size = broadcast_to_shape_from_left(step_size, sample.shape)
         prev_sample_mean = sample + step_size * model_output
         prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise
 
diff --git a/src/diffusers/schedulers/scheduling_utils_flax.py b/src/diffusers/schedulers/scheduling_utils_flax.py
index 63de51f146f5..e545cfe2479e 100644
--- a/src/diffusers/schedulers/scheduling_utils_flax.py
+++ b/src/diffusers/schedulers/scheduling_utils_flax.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
+from typing import Tuple
 
 import jax.numpy as jnp
 
@@ -41,3 +42,8 @@ class FlaxSchedulerMixin:
     """
 
     config_name = SCHEDULER_CONFIG_NAME
+
+
+def broadcast_to_shape_from_left(x: jnp.ndarray, shape: Tuple[int]) -> jnp.ndarray:
+    assert len(shape) >= x.ndim
+    return jnp.broadcast_to(x.reshape(x.shape + (1,) * (len(shape) - x.ndim)), shape)
diff --git a/tests/test_scheduler_flax.py b/tests/test_scheduler_flax.py
new file mode 100644
index 000000000000..d2feaa752ae2
--- /dev/null
+++ b/tests/test_scheduler_flax.py
@@ -0,0 +1,863 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import tempfile
+import unittest
+from typing import Dict, List, Tuple
+
+from diffusers import FlaxDDIMScheduler, FlaxDDPMScheduler, FlaxPNDMScheduler
+from diffusers.utils import is_flax_available
+from diffusers.utils.testing_utils import require_flax
+
+
+if is_flax_available():
+    import jax.numpy as jnp
+    from jax import random
+
+
+@require_flax
+class FlaxSchedulerCommonTest(unittest.TestCase):
+    scheduler_classes = ()
+    forward_default_kwargs = ()
+
+    @property
+    def dummy_sample(self):
+        batch_size = 4
+        num_channels = 3
+        height = 8
+        width = 8
+
+        key1, key2 = random.split(random.PRNGKey(0))
+        sample = random.uniform(key1, (batch_size, num_channels, height, width))
+
+        return sample, key2
+
+    @property
+    def dummy_sample_deter(self):
+        batch_size = 4
+        num_channels = 3
+        height = 8
+        width = 8
+
+        num_elems = batch_size * num_channels * height * width
+        sample = jnp.arange(num_elems)
+        sample = sample.reshape(num_channels, height, width, batch_size)
+        sample = sample / num_elems
+        return jnp.transpose(sample, (3, 0, 1, 2))
+
+    def get_scheduler_config(self):
+        raise NotImplementedError
+
+    def dummy_model(self):
+        def model(sample, t, *args):
+            return sample * t / (t + 1)
+
+        return model
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, key = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, time_step, sample, key, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, time_step, sample, key, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        kwargs.update(forward_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, key = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, time_step, sample, key, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, time_step, sample, key, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_pretrained_save_pretrained(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, key = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, 1, sample, key, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, 1, sample, key, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, key = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output_0 = scheduler.step(state, residual, 0, sample, key, **kwargs).prev_sample
+            output_1 = scheduler.step(state, residual, 1, sample, key, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_scheduler_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            return t.at[t != t].set(0)
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}."
+                    ),
+                )
+
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, key = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_dict = scheduler.step(state, residual, 0, sample, key, **kwargs)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_tuple = scheduler.step(state, residual, 0, sample, key, return_dict=False, **kwargs)
+
+            recursive_check(outputs_tuple[0], outputs_dict.prev_sample)
+
+
+@require_flax
+class FlaxDDPMSchedulerTest(FlaxSchedulerCommonTest):
+    scheduler_classes = (FlaxDDPMScheduler,)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "variance_type": "fixed_small",
+            "clip_sample": True,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [1, 5, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_variance_type(self):
+        for variance in ["fixed_small", "fixed_large", "other"]:
+            self.check_over_configs(variance_type=variance)
+
+    def test_clip_sample(self):
+        for clip_sample in [True, False]:
+            self.check_over_configs(clip_sample=clip_sample)
+
+    def test_time_indices(self):
+        for t in [0, 500, 999]:
+            self.check_over_forward(time_step=t)
+
+    def test_variance(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        assert jnp.sum(jnp.abs(scheduler._get_variance(0) - 0.0)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(999) - 0.02)) < 1e-5
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+
+        num_trained_timesteps = len(scheduler)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        key1, key2 = random.split(random.PRNGKey(0))
+
+        for t in reversed(range(num_trained_timesteps)):
+            # 1. predict noise residual
+            residual = model(sample, t)
+
+            # 2. predict previous mean of sample x_t-1
+            output = scheduler.step(state, residual, t, sample, key1)
+            pred_prev_sample = output.prev_sample
+            state = output.state
+            key1, key2 = random.split(key2)
+
+            # if t > 0:
+            #     noise = self.dummy_sample_deter
+            #     variance = scheduler.get_variance(t) ** (0.5) * noise
+            #
+            # sample = pred_prev_sample + variance
+            sample = pred_prev_sample
+
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        assert abs(result_sum - 255.1113) < 1e-2
+        assert abs(result_mean - 0.332176) < 1e-3
+
+
+@require_flax
+class FlaxDDIMSchedulerTest(FlaxSchedulerCommonTest):
+    scheduler_classes = (FlaxDDIMScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 50),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+        key1, key2 = random.split(random.PRNGKey(0))
+
+        num_inference_steps = 10
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+
+        state = scheduler.set_timesteps(state, num_inference_steps)
+
+        for t in state.timesteps:
+            residual = model(sample, t)
+            output = scheduler.step(state, residual, t, sample)
+            sample = output.prev_sample
+            state = output.state
+            key1, key2 = random.split(key2)
+
+        return sample
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, time_step, sample, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_pretrained_save_pretrained(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, 1, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, 1, sample, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        kwargs.update(forward_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output = scheduler.step(state, residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(new_state, residual, time_step, sample, **kwargs).prev_sample
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_scheduler_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            return t.at[t != t].set(0)
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}."
+                    ),
+                )
+
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_dict = scheduler.step(state, residual, 0, sample, **kwargs)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_tuple = scheduler.step(state, residual, 0, sample, return_dict=False, **kwargs)
+
+            recursive_check(outputs_tuple[0], outputs_dict.prev_sample)
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            output_0 = scheduler.step(state, residual, 0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(state, residual, 1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [100, 500, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_steps_offset(self):
+        for steps_offset in [0, 1]:
+            self.check_over_configs(steps_offset=steps_offset)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(steps_offset=1)
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+        state = scheduler.set_timesteps(state, 5)
+        assert jnp.equal(state.timesteps, jnp.array([801, 601, 401, 201, 1])).all()
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_time_indices(self):
+        for t in [1, 10, 49]:
+            self.check_over_forward(time_step=t)
+
+    def test_inference_steps(self):
+        for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]):
+            self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps)
+
+    def test_variance(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+
+        assert jnp.sum(jnp.abs(scheduler._get_variance(0, 0, state.alphas_cumprod) - 0.0)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(420, 400, state.alphas_cumprod) - 0.14771)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(980, 960, state.alphas_cumprod) - 0.32460)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(0, 0, state.alphas_cumprod) - 0.0)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(487, 486, state.alphas_cumprod) - 0.00979)) < 1e-5
+        assert jnp.sum(jnp.abs(scheduler._get_variance(999, 998, state.alphas_cumprod) - 0.02)) < 1e-5
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        assert abs(result_sum - 172.0067) < 1e-2
+        assert abs(result_mean - 0.223967) < 1e-3
+
+    def test_full_loop_with_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01)
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        assert abs(result_sum - 149.8295) < 1e-2
+        assert abs(result_mean - 0.1951) < 1e-3
+
+    def test_full_loop_with_no_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01)
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        assert abs(result_sum - 149.0784) < 1e-2
+        assert abs(result_mean - 0.1941) < 1e-3
+
+
+@require_flax
+class FlaxPNDMSchedulerTest(FlaxSchedulerCommonTest):
+    scheduler_classes = (FlaxPNDMScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 50),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample, _ = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05])
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+            state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+            # copy over dummy past residuals
+            state = state.replace(ets=dummy_past_residuals[:])
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps, shape=sample.shape)
+                # copy over dummy past residuals
+                new_state = new_state.replace(ets=dummy_past_residuals[:])
+
+            (prev_sample, state) = scheduler.step_prk(state, residual, time_step, sample, **kwargs)
+            (new_prev_sample, new_state) = new_scheduler.step_prk(new_state, residual, time_step, sample, **kwargs)
+
+            assert jnp.sum(jnp.abs(prev_sample - new_prev_sample)) < 1e-5, "Scheduler outputs are not identical"
+
+            output, _ = scheduler.step_plms(state, residual, time_step, sample, **kwargs)
+            new_output, _ = new_scheduler.step_plms(new_state, residual, time_step, sample, **kwargs)
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_pretrained_save_pretrained(self):
+        pass
+
+    def test_scheduler_outputs_equivalence(self):
+        def set_nan_tensor_to_zero(t):
+            return t.at[t != t].set(0)
+
+        def recursive_check(tuple_object, dict_object):
+            if isinstance(tuple_object, (List, Tuple)):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif isinstance(tuple_object, Dict):
+                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
+                    recursive_check(tuple_iterable_value, dict_iterable_value)
+            elif tuple_object is None:
+                return
+            else:
+                self.assertTrue(
+                    jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5),
+                    msg=(
+                        "Tuple and dict output are not equal. Difference:"
+                        f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                        f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has"
+                        f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}."
+                    ),
+                )
+
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_dict = scheduler.step(state, residual, 0, sample, **kwargs)
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            outputs_tuple = scheduler.step(state, residual, 0, sample, return_dict=False, **kwargs)
+
+            recursive_check(outputs_tuple[0], outputs_dict.prev_sample)
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample, _ = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05])
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+            state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.ets = dummy_past_residuals[:]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+                # copy over dummy past residuals
+                new_state = new_scheduler.set_timesteps(new_state, num_inference_steps, shape=sample.shape)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_state.replace(ets=dummy_past_residuals[:])
+
+            output, state = scheduler.step_prk(state, residual, time_step, sample, **kwargs)
+            new_output, new_state = new_scheduler.step_prk(new_state, residual, time_step, sample, **kwargs)
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+            output, _ = scheduler.step_plms(state, residual, time_step, sample, **kwargs)
+            new_output, _ = new_scheduler.step_plms(new_state, residual, time_step, sample, **kwargs)
+
+            assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+
+        for i, t in enumerate(state.prk_timesteps):
+            residual = model(sample, t)
+            sample, state = scheduler.step_prk(state, residual, t, sample)
+
+        for i, t in enumerate(state.plms_timesteps):
+            residual = model(sample, t)
+            sample, state = scheduler.step_plms(state, residual, t, sample)
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05])
+            state = state.replace(ets=dummy_past_residuals[:])
+
+            output_0, state = scheduler.step_prk(state, residual, 0, sample, **kwargs)
+            output_1, state = scheduler.step_prk(state, residual, 1, sample, **kwargs)
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+            output_0, state = scheduler.step_plms(state, residual, 0, sample, **kwargs)
+            output_1, state = scheduler.step_plms(state, residual, 1, sample, **kwargs)
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_steps_offset(self):
+        for steps_offset in [0, 1]:
+            self.check_over_configs(steps_offset=steps_offset)
+
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(steps_offset=1)
+        scheduler = scheduler_class(**scheduler_config)
+        state = scheduler.create_state()
+        state = scheduler.set_timesteps(state, 10, shape=())
+        assert jnp.equal(
+            state.timesteps,
+            jnp.array([901, 851, 851, 801, 801, 751, 751, 701, 701, 651, 651, 601, 601, 501, 401, 301, 201, 101, 1]),
+        ).all()
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "squaredcos_cap_v2"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_time_indices(self):
+        for t in [1, 5, 10]:
+            self.check_over_forward(time_step=t)
+
+    def test_inference_steps(self):
+        for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]):
+            self.check_over_forward(num_inference_steps=num_inference_steps)
+
+    def test_pow_of_3_inference_steps(self):
+        # earlier version of set_timesteps() caused an error indexing alpha's with inference steps as power of 3
+        num_inference_steps = 27
+
+        for scheduler_class in self.scheduler_classes:
+            sample, _ = self.dummy_sample
+            residual = 0.1 * sample
+
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape)
+
+            # before power of 3 fix, would error on first step, so we only need to do two
+            for i, t in enumerate(state.prk_timesteps[:2]):
+                sample, state = scheduler.step_prk(state, residual, t, sample)
+
+    def test_inference_plms_no_past_residuals(self):
+        with self.assertRaises(ValueError):
+            scheduler_class = self.scheduler_classes[0]
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            state = scheduler.create_state()
+
+            scheduler.step_plms(state, self.dummy_sample, 1, self.dummy_sample).prev_sample
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        assert abs(result_sum - 198.1318) < 1e-2
+        assert abs(result_mean - 0.2580) < 1e-3
+
+    def test_full_loop_with_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01)
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        assert abs(result_sum - 186.9466) < 1e-2
+        assert abs(result_mean - 0.24342) < 1e-3
+
+    def test_full_loop_with_no_set_alpha_to_one(self):
+        # We specify different beta, so that the first alpha is 0.99
+        sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01)
+        result_sum = jnp.sum(jnp.abs(sample))
+        result_mean = jnp.mean(jnp.abs(sample))
+
+        assert abs(result_sum - 186.9482) < 1e-2
+        assert abs(result_mean - 0.2434) < 1e-3

From 28b134e6277a168381eadfb522c946f09cf3beff Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Tue, 25 Oct 2022 15:28:08 +0200
Subject: [PATCH 040/269] [Tests] Fix `mps` reproducibility issue when running
 with pytest-xdist (#976)

* [WIP] Debugging mps DDIM tests

* revert num_steps

* check warmup with a generator

* more warmup!

* remove xdist

* just use a single process
---
 .github/workflows/pr_tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index 493ff51484b0..cf21edf99165 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -89,7 +89,7 @@ jobs:
     - name: Run all fast tests on MPS
       shell: arch -arch arm64 bash {0}
       run: |
-        ${CONDA_RUN} python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_mps tests/
+        ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/
 
     - name: Failure short reports
       if: ${{ failure() }}

From 3d02c92187044393d90cc4768f9d0a0939322833 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Tue, 25 Oct 2022 16:41:51 +0200
Subject: [PATCH 041/269] mps changes for PyTorch 1.13 (#926)

* Docs: refer to pre-RC version of PyTorch 1.13.0.

* Remove temporary workaround for unavailable op.

* Update comment to make it less ambiguous.

* Remove use of contiguous in mps.

It appears to not longer be necessary.

* Special case: use einsum for much better performance in mps

* Update mps docs.

* Minor doc update.

* Accept suggestion

Co-authored-by: Anton Lozhkov <anton@huggingface.co>

Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 docs/source/optimization/mps.mdx              | 30 ++++++++++-------
 .../community/clip_guided_stable_diffusion.py |  2 +-
 .../community/interpolate_stable_diffusion.py |  2 +-
 src/diffusers/models/attention.py             | 32 +++++++++++++++----
 src/diffusers/models/resnet.py                |  4 ---
 .../pipeline_stable_diffusion.py              |  2 +-
 6 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/docs/source/optimization/mps.mdx b/docs/source/optimization/mps.mdx
index 754adae5d9b2..4eeabc658737 100644
--- a/docs/source/optimization/mps.mdx
+++ b/docs/source/optimization/mps.mdx
@@ -17,9 +17,13 @@ specific language governing permissions and limitations under the License.
 ## Requirements
 
 - Mac computer with Apple silicon (M1/M2) hardware.
-- macOS 12.3 or later.
+- macOS 12.6 or later (13.0 or later recommended).
 - arm64 version of Python.
-- PyTorch [Preview (Nightly)](https://pytorch.org/get-started/locally/), version `1.14.0.dev20221007` or later.
+- PyTorch 1.13.0 RC (Release Candidate). You can install it with `pip` using:
+
+```
+pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cpu
+```
 
 ## Inference Pipeline
 
@@ -34,6 +38,9 @@ from diffusers import StableDiffusionPipeline
 pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 pipe = pipe.to("mps")
 
+# Recommended if your computer has < 64 GB of RAM
+pipe.enable_attention_slicing()
+
 prompt = "a photo of an astronaut riding a horse on mars"
 
 # First-time "warmup" pass (see explanation above)
@@ -43,16 +50,17 @@ _ = pipe(prompt, num_inference_steps=1)
 image = pipe(prompt).images[0]
 ```
 
-## Known Issues
+## Performance Recommendations
 
-- As mentioned above, we are investigating a strange [first-time inference issue](https://github.com/huggingface/diffusers/issues/372).
-- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this might be related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039#issuecomment-1237735249), but we need to investigate in more depth. For now, we recommend to iterate instead of batching.
+M1/M2 performance is very sensitive to memory pressure. The system will automatically swap if it needs to, but performance will degrade significantly when it does.
 
-## Performance
+We recommend you use _attention slicing_ to reduce memory pressure during inference and prevent swapping, particularly if your computer has lass than 64 GB of system RAM, or if you generate images at non-standard resolutions larger than 512 × 512 pixels. Attention slicing performs the costly attention operation in multiple steps instead of all at once. It usually has a performance impact of ~20% in computers without universal memory, but we have observed _better performance_ in most Apple Silicon computers, unless you have 64 GB or more.
 
-These are the results we got on a M1 Max MacBook Pro with 64 GB of RAM, running macOS Ventura Version 13.0 Beta (22A5331f). We performed Stable Diffusion text-to-image generation of the same prompt for 50 inference steps, using a guidance scale of 7.5.
+```python
+pipeline.enable_attention_slicing()
+```
 
-| Device | Steps | Time    |
-|--------|-------|---------|
-| CPU    | 50    | 213.46s |
-| MPS    | 50    | 30.81s  |
\ No newline at end of file
+## Known Issues
+
+- As mentioned above, we are investigating a strange [first-time inference issue](https://github.com/huggingface/diffusers/issues/372).
+- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this is related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039). For now, we recommend to iterate instead of batching.
diff --git a/examples/community/clip_guided_stable_diffusion.py b/examples/community/clip_guided_stable_diffusion.py
index 974f4ab2e883..2c86e9130fdc 100644
--- a/examples/community/clip_guided_stable_diffusion.py
+++ b/examples/community/clip_guided_stable_diffusion.py
@@ -249,7 +249,7 @@ def __call__(
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
-                # randn does not exist on mps
+                # randn does not work reproducibly on mps
                 latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
                     self.device
                 )
diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py
index 97116bdc77b4..bbb1b0f9e633 100644
--- a/examples/community/interpolate_stable_diffusion.py
+++ b/examples/community/interpolate_stable_diffusion.py
@@ -324,7 +324,7 @@ def __call__(
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
-                # randn does not exist on mps
+                # randn does not work reproducibly on mps
                 latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
                     self.device
                 )
diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 4906e10f27c4..dce30c6a4aa6 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -207,7 +207,6 @@ def _set_attention_slice(self, slice_size):
         self.attn2._slice_size = slice_size
 
     def forward(self, hidden_states, context=None):
-        hidden_states = hidden_states.contiguous() if hidden_states.device.type == "mps" else hidden_states
         hidden_states = self.attn1(self.norm1(hidden_states)) + hidden_states
         hidden_states = self.attn2(self.norm2(hidden_states), context=context) + hidden_states
         hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
@@ -288,10 +287,19 @@ def forward(self, hidden_states, context=None, mask=None):
 
     def _attention(self, query, key, value):
         # TODO: use baddbmm for better performance
-        attention_scores = torch.matmul(query, key.transpose(-1, -2)) * self.scale
+        if query.device.type == "mps":
+            # Better performance on mps (~20-25%)
+            attention_scores = torch.einsum("b i d, b j d -> b i j", query, key) * self.scale
+        else:
+            attention_scores = torch.matmul(query, key.transpose(-1, -2)) * self.scale
         attention_probs = attention_scores.softmax(dim=-1)
         # compute attention output
-        hidden_states = torch.matmul(attention_probs, value)
+
+        if query.device.type == "mps":
+            hidden_states = torch.einsum("b i j, b j d -> b i d", attention_probs, value)
+        else:
+            hidden_states = torch.matmul(attention_probs, value)
+
         # reshape hidden_states
         hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
         return hidden_states
@@ -305,11 +313,21 @@ def _sliced_attention(self, query, key, value, sequence_length, dim):
         for i in range(hidden_states.shape[0] // slice_size):
             start_idx = i * slice_size
             end_idx = (i + 1) * slice_size
-            attn_slice = (
-                torch.matmul(query[start_idx:end_idx], key[start_idx:end_idx].transpose(1, 2)) * self.scale
-            )  # TODO: use baddbmm for better performance
+            if query.device.type == "mps":
+                # Better performance on mps (~20-25%)
+                attn_slice = (
+                    torch.einsum("b i d, b j d -> b i j", query[start_idx:end_idx], key[start_idx:end_idx])
+                    * self.scale
+                )
+            else:
+                attn_slice = (
+                    torch.matmul(query[start_idx:end_idx], key[start_idx:end_idx].transpose(1, 2)) * self.scale
+                )  # TODO: use baddbmm for better performance
             attn_slice = attn_slice.softmax(dim=-1)
-            attn_slice = torch.matmul(attn_slice, value[start_idx:end_idx])
+            if query.device.type == "mps":
+                attn_slice = torch.einsum("b i j, b j d -> b i d", attn_slice, value[start_idx:end_idx])
+            else:
+                attn_slice = torch.matmul(attn_slice, value[start_idx:end_idx])
 
             hidden_states[start_idx:end_idx] = attn_slice
 
diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
index d4cb367ebc0b..fbd78b512a6b 100644
--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -492,10 +492,6 @@ def upfirdn2d_native(tensor, kernel, up=1, down=1, pad=(0, 0)):
     kernel_h, kernel_w = kernel.shape
 
     out = tensor.view(-1, in_h, 1, in_w, 1, minor)
-
-    # Temporary workaround for mps specific issue: https://github.com/pytorch/pytorch/issues/84535
-    if tensor.device.type == "mps":
-        out = out.to("cpu")
     out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
     out = out.view(-1, in_h * up_y, in_w * up_x, minor)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 46def26e94cc..02a6b45fdefc 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -287,7 +287,7 @@ def __call__(
         latents_dtype = text_embeddings.dtype
         if latents is None:
             if self.device.type == "mps":
-                # randn does not exist on mps
+                # randn does not work reproducibly on mps
                 latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
                     self.device
                 )

From 0b42b074b4f6e2eff26c11743f8f7d0e6ad9d79c Mon Sep 17 00:00:00 2001
From: SkyTNT <SKYTNT@outlook.com>
Date: Tue, 25 Oct 2022 22:48:53 +0800
Subject: [PATCH 042/269] [Onnx] support half-precision and fix bugs for onnx
 pipelines (#932)

* [Onnx] support half-precision and fix bugs for onnx pipelines

* Update convert_stable_diffusion_checkpoint_to_onnx.py

* style

* fix has_nsfw_concept

* Update convert_stable_diffusion_checkpoint_to_onnx.py

* fix style
---
 ...ert_stable_diffusion_checkpoint_to_onnx.py | 33 ++++++++++-----
 .../pipeline_onnx_stable_diffusion.py         | 35 +++++++++++++---
 .../pipeline_onnx_stable_diffusion_img2img.py | 34 +++++++++++++---
 .../pipeline_onnx_stable_diffusion_inpaint.py | 40 +++++++++++++++----
 4 files changed, 112 insertions(+), 30 deletions(-)

diff --git a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
index 2a2c2031a7b5..8e0b58c56df3 100644
--- a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
+++ b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
@@ -69,8 +69,15 @@ def onnx_export(
 
 
 @torch.no_grad()
-def convert_models(model_path: str, output_path: str, opset: int):
-    pipeline = StableDiffusionPipeline.from_pretrained(model_path)
+def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = False):
+    dtype = torch.float16 if fp16 else torch.float32
+    if fp16 and torch.cuda.is_available():
+        device = "cuda"
+    elif fp16 and not torch.cuda.is_available():
+        raise ValueError("`float16` model export is only supported on GPUs with CUDA")
+    else:
+        device = "cpu"
+    pipeline = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=dtype).to(device)
     output_path = Path(output_path)
 
     # TEXT ENCODER
@@ -84,7 +91,7 @@ def convert_models(model_path: str, output_path: str, opset: int):
     onnx_export(
         pipeline.text_encoder,
         # casting to torch.int32 until the CLIP fix is released: https://github.com/huggingface/transformers/pull/18515/files
-        model_args=(text_input.input_ids.to(torch.int32)),
+        model_args=(text_input.input_ids.to(device=device, dtype=torch.int32)),
         output_path=output_path / "text_encoder" / "model.onnx",
         ordered_input_names=["input_ids"],
         output_names=["last_hidden_state", "pooler_output"],
@@ -100,9 +107,9 @@ def convert_models(model_path: str, output_path: str, opset: int):
     onnx_export(
         pipeline.unet,
         model_args=(
-            torch.randn(2, pipeline.unet.in_channels, 64, 64),
-            torch.LongTensor([0, 1]),
-            torch.randn(2, 77, 768),
+            torch.randn(2, pipeline.unet.in_channels, 64, 64).to(device=device, dtype=dtype),
+            torch.LongTensor([0, 1]).to(device=device),
+            torch.randn(2, 77, 768).to(device=device, dtype=dtype),
             False,
         ),
         output_path=unet_path,
@@ -139,7 +146,7 @@ def convert_models(model_path: str, output_path: str, opset: int):
     vae_encoder.forward = lambda sample, return_dict: vae_encoder.encode(sample, return_dict)[0].sample()
     onnx_export(
         vae_encoder,
-        model_args=(torch.randn(1, 3, 512, 512), False),
+        model_args=(torch.randn(1, 3, 512, 512).to(device=device, dtype=dtype), False),
         output_path=output_path / "vae_encoder" / "model.onnx",
         ordered_input_names=["sample", "return_dict"],
         output_names=["latent_sample"],
@@ -155,7 +162,7 @@ def convert_models(model_path: str, output_path: str, opset: int):
     vae_decoder.forward = vae_encoder.decode
     onnx_export(
         vae_decoder,
-        model_args=(torch.randn(1, 4, 64, 64), False),
+        model_args=(torch.randn(1, 4, 64, 64).to(device=device, dtype=dtype), False),
         output_path=output_path / "vae_decoder" / "model.onnx",
         ordered_input_names=["latent_sample", "return_dict"],
         output_names=["sample"],
@@ -171,13 +178,16 @@ def convert_models(model_path: str, output_path: str, opset: int):
     safety_checker.forward = safety_checker.forward_onnx
     onnx_export(
         pipeline.safety_checker,
-        model_args=(torch.randn(1, 3, 224, 224), torch.randn(1, 512, 512, 3)),
+        model_args=(
+            torch.randn(1, 3, 224, 224).to(device=device, dtype=dtype),
+            torch.randn(1, 512, 512, 3).to(device=device, dtype=dtype),
+        ),
         output_path=output_path / "safety_checker" / "model.onnx",
         ordered_input_names=["clip_input", "images"],
         output_names=["out_images", "has_nsfw_concepts"],
         dynamic_axes={
             "clip_input": {0: "batch", 1: "channels", 2: "height", 3: "width"},
-            "images": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+            "images": {0: "batch", 1: "height", 2: "width", 3: "channels"},
         },
         opset=opset,
     )
@@ -221,7 +231,8 @@ def convert_models(model_path: str, output_path: str, opset: int):
         type=int,
         help="The version of the ONNX operator set to use.",
     )
+    parser.add_argument("--fp16", action="store_true", default=False, help="Export the models in `float16` mode")
 
     args = parser.parse_args()
 
-    convert_models(args.model_path, args.output_path, args.opset)
+    convert_models(args.model_path, args.output_path, args.opset, args.fp16)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index 9fd0a635a8d4..a2cb99be421f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -55,7 +55,9 @@ def __call__(
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
         eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
         latents: Optional[np.ndarray] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
@@ -81,6 +83,9 @@ def __call__(
                 f" {type(callback_steps)}."
             )
 
+        if generator is None:
+            generator = np.random
+
         # get prompt text embeddings
         text_inputs = self.tokenizer(
             prompt,
@@ -98,6 +103,7 @@ def __call__(
             )
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
         text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -133,6 +139,7 @@ def __call__(
                 return_tensors="np",
             )
             uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
 
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
@@ -140,9 +147,10 @@ def __call__(
             text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
 
         # get the initial random noise unless the user supplied it
-        latents_shape = (batch_size, 4, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
         if latents is None:
-            latents = np.random.randn(*latents_shape).astype(np.float32)
+            latents = generator.randn(*latents_shape).astype(latents_dtype)
         elif latents.shape != latents_shape:
             raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
 
@@ -185,13 +193,30 @@ def __call__(
                 callback(i, t, latents)
 
         latents = 1 / 0.18215 * latents
-        image = self.vae_decoder(latent_sample=latents)[0]
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
+        image = np.concatenate(
+            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+        )
 
         image = np.clip(image / 2 + 0.5, 0, 1)
         image = image.transpose((0, 2, 3, 1))
 
-        safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np")
-        image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image)
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # There will throw an error if use safety_checker batchsize>1
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
 
         if output_type == "pil":
             image = self.numpy_to_pil(image)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index ce3f3fbacbc7..b2105458b574 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -121,6 +121,7 @@ def __call__(
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: Optional[int] = 1,
         eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
@@ -159,6 +160,8 @@ def __call__(
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -197,6 +200,9 @@ def __call__(
                 f" {type(callback_steps)}."
             )
 
+        if generator is None:
+            generator = np.random
+
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
 
@@ -239,7 +245,7 @@ def __call__(
                     f" {type(prompt)}."
                 )
             elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
+                uncond_tokens = [negative_prompt] * batch_size
             elif batch_size != len(negative_prompt):
                 raise ValueError("The length of `negative_prompt` should be equal to batch_size.")
             else:
@@ -257,13 +263,15 @@ def __call__(
             uncond_embeddings = self.text_encoder(input_ids=uncond_input_ids.astype(np.int32))[0]
 
             # duplicate unconditional embeddings for each generation per prompt
-            uncond_embeddings = np.repeat(uncond_embeddings, batch_size * num_images_per_prompt, axis=0)
+            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
 
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
             # to avoid doing two forward passes
             text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
 
+        latents_dtype = text_embeddings.dtype
+        init_image = init_image.astype(latents_dtype)
         # encode the init image into latents and scale the latents
         init_latents = self.vae_encoder(sample=init_image)[0]
         init_latents = 0.18215 * init_latents
@@ -297,7 +305,7 @@ def __call__(
         timesteps = np.array([timesteps] * batch_size * num_images_per_prompt)
 
         # add noise to latents using the timesteps
-        noise = np.random.randn(*init_latents.shape).astype(np.float32)
+        noise = generator.randn(*init_latents.shape).astype(latents_dtype)
         init_latents = self.scheduler.add_noise(
             torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
         )
@@ -341,14 +349,28 @@ def __call__(
                 callback(i, t, latents)
 
         latents = 1 / 0.18215 * latents
-        image = self.vae_decoder(latent_sample=latents)[0]
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
+        image = np.concatenate(
+            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+        )
 
         image = np.clip(image / 2 + 0.5, 0, 1)
         image = image.transpose((0, 2, 3, 1))
 
         if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np")
-            image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image)
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # There will throw an error if use safety_checker batchsize>1
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
         else:
             has_nsfw_concept = None
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index b45d968f66e3..37374aa4b1bc 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -23,11 +23,11 @@
 
 
 def prepare_mask_and_masked_image(image, mask, latents_shape):
-    image = np.array(image.convert("RGB"))
+    image = np.array(image.convert("RGB").resize((latents_shape[1] * 8, latents_shape[0] * 8)))
     image = image[None].transpose(0, 3, 1, 2)
     image = image.astype(np.float32) / 127.5 - 1.0
 
-    image_mask = np.array(mask.convert("L"))
+    image_mask = np.array(mask.convert("L").resize((latents_shape[1] * 8, latents_shape[0] * 8)))
     masked_image = image * (image_mask < 127.5)
 
     mask = mask.resize((latents_shape[1], latents_shape[0]), PIL.Image.NEAREST)
@@ -138,6 +138,7 @@ def __call__(
         negative_prompt: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: Optional[int] = 1,
         eta: float = 0.0,
+        generator: Optional[np.random.RandomState] = None,
         latents: Optional[np.ndarray] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
@@ -180,6 +181,8 @@ def __call__(
             eta (`float`, *optional*, defaults to 0.0):
                 Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                 [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
             latents (`np.ndarray`, *optional*):
                 Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                 generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
@@ -222,6 +225,9 @@ def __call__(
                 f" {type(callback_steps)}."
             )
 
+        if generator is None:
+            generator = np.random
+
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
 
@@ -261,7 +267,7 @@ def __call__(
                     f" {type(prompt)}."
                 )
             elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
+                uncond_tokens = [negative_prompt] * batch_size
             elif batch_size != len(negative_prompt):
                 raise ValueError(
                     f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
@@ -283,7 +289,7 @@ def __call__(
             uncond_embeddings = self.text_encoder(input_ids=uncond_input_ids.astype(np.int32))[0]
 
             # duplicate unconditional embeddings for each generation per prompt
-            uncond_embeddings = np.repeat(uncond_embeddings, batch_size * num_images_per_prompt, axis=0)
+            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
 
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
@@ -294,7 +300,7 @@ def __call__(
         latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
-            latents = np.random.randn(*latents_shape).astype(latents_dtype)
+            latents = generator.randn(*latents_shape).astype(latents_dtype)
         else:
             if latents.shape != latents_shape:
                 raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
@@ -307,6 +313,10 @@ def __call__(
         masked_image_latents = self.vae_encoder(sample=masked_image)[0]
         masked_image_latents = 0.18215 * masked_image_latents
 
+        # duplicate mask and masked_image_latents for each generation per prompt
+        mask = mask.repeat(batch_size * num_images_per_prompt, 0)
+        masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 0)
+
         mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask
         masked_image_latents = (
             np.concatenate([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
@@ -367,14 +377,28 @@ def __call__(
                 callback(i, t, latents)
 
         latents = 1 / 0.18215 * latents
-        image = self.vae_decoder(latent_sample=latents)[0]
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
+        image = np.concatenate(
+            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+        )
 
         image = np.clip(image / 2 + 0.5, 0, 1)
         image = image.transpose((0, 2, 3, 1))
 
         if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np")
-            image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image)
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # There will throw an error if use safety_checker batchsize>1
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
         else:
             has_nsfw_concept = None
 

From 88fa6b7d68d77b2531462ebe5a339b8c5b034ce4 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 25 Oct 2022 18:39:25 +0200
Subject: [PATCH 043/269] [Dance Diffusion] Add dance diffusion (#803)

* start

* add more logic

* Update src/diffusers/models/unet_2d_condition_flax.py

* match weights

* up

* make model work

* making class more general, fixing missed file rename

* small fix

* make new conversion work

* up

* finalize conversion

* up

* first batch of variable renamings

* remove c and c_prev var names

* add mid and out block structure

* add pipeline

* up

* finish conversion

* finish

* upload

* more fixes

* Apply suggestions from code review

* add attr

* up

* uP

* up

* finish tests

* finish

* uP

* finish

* fix test

* up

* naming consistency in tests

* Apply suggestions from code review

Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
Co-authored-by: Nathan Lambert <nathan@huggingface.co>
Co-authored-by: Anton Lozhkov <anton@huggingface.co>

* remove hardcoded 16

* Remove bogus

* fix some stuff

* finish

* improve logging

* docs

* upload

Co-authored-by: Nathan Lambert <nol@berkeley.edu>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
Co-authored-by: Nathan Lambert <nathan@huggingface.co>
Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 docs/source/_toctree.yml                      |   2 +
 docs/source/api/models.mdx                    |   3 +
 docs/source/api/pipelines/dance_diffusion.mdx |  33 ++
 docs/source/api/schedulers.mdx                |   4 +
 .../convert_dance_diffusion_to_diffusers.py   | 339 ++++++++++++++++
 src/diffusers/__init__.py                     |  13 +-
 src/diffusers/models/__init__.py              |   1 +
 src/diffusers/models/embeddings.py            |  23 +-
 src/diffusers/models/unet_1d.py               | 172 ++++++++
 src/diffusers/models/unet_1d_blocks.py        | 384 ++++++++++++++++++
 src/diffusers/models/unet_2d.py               |   2 +-
 .../{unet_blocks.py => unet_2d_blocks.py}     |   0
 ..._blocks_flax.py => unet_2d_blocks_flax.py} |   0
 src/diffusers/models/unet_2d_condition.py     |   2 +-
 .../models/unet_2d_condition_flax.py          |   2 +-
 src/diffusers/models/vae.py                   |   2 +-
 src/diffusers/pipeline_utils.py               |  14 +
 src/diffusers/pipelines/__init__.py           |   1 +
 .../pipelines/dance_diffusion/__init__.py     |   2 +
 .../pipeline_dance_diffusion.py               | 113 ++++++
 src/diffusers/schedulers/__init__.py          |   1 +
 src/diffusers/schedulers/scheduling_ipndm.py  | 152 +++++++
 src/diffusers/utils/dummy_pt_objects.py       |  45 ++
 .../dance_diffusion/test_dance_diffusion.py   | 101 +++++
 tests/test_models_unet_1d.py                  |  45 ++
 ..._models_unet.py => test_models_unet_2d.py} |   2 +-
 tests/test_scheduler.py                       | 171 +++++++-
 27 files changed, 1613 insertions(+), 16 deletions(-)
 create mode 100644 docs/source/api/pipelines/dance_diffusion.mdx
 create mode 100755 scripts/convert_dance_diffusion_to_diffusers.py
 create mode 100644 src/diffusers/models/unet_1d.py
 create mode 100644 src/diffusers/models/unet_1d_blocks.py
 rename src/diffusers/models/{unet_blocks.py => unet_2d_blocks.py} (100%)
 rename src/diffusers/models/{unet_blocks_flax.py => unet_2d_blocks_flax.py} (100%)
 create mode 100644 src/diffusers/pipelines/dance_diffusion/__init__.py
 create mode 100644 src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
 create mode 100644 src/diffusers/schedulers/scheduling_ipndm.py
 create mode 100644 tests/pipelines/dance_diffusion/test_dance_diffusion.py
 create mode 100644 tests/test_models_unet_1d.py
 rename tests/{test_models_unet.py => test_models_unet_2d.py} (99%)

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index a0358c32e4ec..43a4fc4bf11e 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -92,5 +92,7 @@
       title: "Stable Diffusion"
     - local: api/pipelines/stochastic_karras_ve
       title: "Stochastic Karras VE"
+    - local: api/pipelines/dance_diffusion
+      title: "Dance Diffusion"
     title: "Pipelines"
   title: "API"
diff --git a/docs/source/api/models.mdx b/docs/source/api/models.mdx
index c92fdccb8333..c3f5e65edfbd 100644
--- a/docs/source/api/models.mdx
+++ b/docs/source/api/models.mdx
@@ -22,6 +22,9 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
 ## UNet2DOutput
 [[autodoc]] models.unet_2d.UNet2DOutput
 
+## UNet1DModel
+[[autodoc]] UNet1DModel
+
 ## UNet2DModel
 [[autodoc]] UNet2DModel
 
diff --git a/docs/source/api/pipelines/dance_diffusion.mdx b/docs/source/api/pipelines/dance_diffusion.mdx
new file mode 100644
index 000000000000..4d969bf6f032
--- /dev/null
+++ b/docs/source/api/pipelines/dance_diffusion.mdx
@@ -0,0 +1,33 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Dance Diffusion
+
+## Overview
+
+[Dance Diffusion](https://github.com/Harmonai-org/sample-generator) by Zach Evans.
+
+Dance Diffusion is the first in a suite of generative audio tools for producers and musicians to be released by Harmonai.
+For more info or to get involved in the development of these tools, please visit https://harmonai.org and fill out the form on the front page.
+
+The original codebase of this implementation can be found [here](https://github.com/Harmonai-org/sample-generator).
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_dance_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py) | *Unconditional Audio Generation* | - |
+
+
+## DanceDiffusionPipeline
+[[autodoc]] DanceDiffusionPipeline
+    - __call__
diff --git a/docs/source/api/schedulers.mdx b/docs/source/api/schedulers.mdx
index 12a6b5c587bc..80fbfdcb2321 100644
--- a/docs/source/api/schedulers.mdx
+++ b/docs/source/api/schedulers.mdx
@@ -95,6 +95,10 @@ Original paper can be found [here](https://arxiv.org/abs/2011.13456).
 
 [[autodoc]] ScoreSdeVeScheduler
 
+#### improved pseudo numerical methods for diffusion models (iPNDM)
+
+Original implementation can be found [here](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296).
+
 #### variance preserving stochastic differential equation (SDE) scheduler
 
 Original paper can be found [here](https://arxiv.org/abs/2011.13456).
diff --git a/scripts/convert_dance_diffusion_to_diffusers.py b/scripts/convert_dance_diffusion_to_diffusers.py
new file mode 100755
index 000000000000..db183bce9262
--- /dev/null
+++ b/scripts/convert_dance_diffusion_to_diffusers.py
@@ -0,0 +1,339 @@
+#!/usr/bin/env python3
+import argparse
+import math
+import os
+from copy import deepcopy
+
+import torch
+from torch import nn
+
+from audio_diffusion.models import DiffusionAttnUnet1D
+from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
+from diffusion import sampling
+
+
+MODELS_MAP = {
+    "gwf-440k": {
+        "url": "https://model-server.zqevans2.workers.dev/gwf-440k.ckpt",
+        "sample_rate": 48000,
+        "sample_size": 65536,
+    },
+    "jmann-small-190k": {
+        "url": "https://model-server.zqevans2.workers.dev/jmann-small-190k.ckpt",
+        "sample_rate": 48000,
+        "sample_size": 65536,
+    },
+    "jmann-large-580k": {
+        "url": "https://model-server.zqevans2.workers.dev/jmann-large-580k.ckpt",
+        "sample_rate": 48000,
+        "sample_size": 131072,
+    },
+    "maestro-uncond-150k": {
+        "url": "https://model-server.zqevans2.workers.dev/maestro-uncond-150k.ckpt",
+        "sample_rate": 16000,
+        "sample_size": 65536,
+    },
+    "unlocked-uncond-250k": {
+        "url": "https://model-server.zqevans2.workers.dev/unlocked-uncond-250k.ckpt",
+        "sample_rate": 16000,
+        "sample_size": 65536,
+    },
+    "honk-140k": {
+        "url": "https://model-server.zqevans2.workers.dev/honk-140k.ckpt",
+        "sample_rate": 16000,
+        "sample_size": 65536,
+    },
+}
+
+
+def alpha_sigma_to_t(alpha, sigma):
+    """Returns a timestep, given the scaling factors for the clean image and for
+    the noise."""
+    return torch.atan2(sigma, alpha) / math.pi * 2
+
+
+def get_crash_schedule(t):
+    sigma = torch.sin(t * math.pi / 2) ** 2
+    alpha = (1 - sigma**2) ** 0.5
+    return alpha_sigma_to_t(alpha, sigma)
+
+
+class Object(object):
+    pass
+
+
+class DiffusionUncond(nn.Module):
+    def __init__(self, global_args):
+        super().__init__()
+
+        self.diffusion = DiffusionAttnUnet1D(global_args, n_attn_layers=4)
+        self.diffusion_ema = deepcopy(self.diffusion)
+        self.rng = torch.quasirandom.SobolEngine(1, scramble=True)
+
+
+def download(model_name):
+    url = MODELS_MAP[model_name]["url"]
+    os.system(f"wget {url} ./")
+
+    return f"./{model_name}.ckpt"
+
+
+DOWN_NUM_TO_LAYER = {
+    "1": "resnets.0",
+    "2": "attentions.0",
+    "3": "resnets.1",
+    "4": "attentions.1",
+    "5": "resnets.2",
+    "6": "attentions.2",
+}
+UP_NUM_TO_LAYER = {
+    "8": "resnets.0",
+    "9": "attentions.0",
+    "10": "resnets.1",
+    "11": "attentions.1",
+    "12": "resnets.2",
+    "13": "attentions.2",
+}
+MID_NUM_TO_LAYER = {
+    "1": "resnets.0",
+    "2": "attentions.0",
+    "3": "resnets.1",
+    "4": "attentions.1",
+    "5": "resnets.2",
+    "6": "attentions.2",
+    "8": "resnets.3",
+    "9": "attentions.3",
+    "10": "resnets.4",
+    "11": "attentions.4",
+    "12": "resnets.5",
+    "13": "attentions.5",
+}
+DEPTH_0_TO_LAYER = {
+    "0": "resnets.0",
+    "1": "resnets.1",
+    "2": "resnets.2",
+    "4": "resnets.0",
+    "5": "resnets.1",
+    "6": "resnets.2",
+}
+
+RES_CONV_MAP = {
+    "skip": "conv_skip",
+    "main.0": "conv_1",
+    "main.1": "group_norm_1",
+    "main.3": "conv_2",
+    "main.4": "group_norm_2",
+}
+
+ATTN_MAP = {
+    "norm": "group_norm",
+    "qkv_proj": ["query", "key", "value"],
+    "out_proj": ["proj_attn"],
+}
+
+
+def convert_resconv_naming(name):
+    if name.startswith("skip"):
+        return name.replace("skip", RES_CONV_MAP["skip"])
+
+    # name has to be of format main.{digit}
+    if not name.startswith("main."):
+        raise ValueError(f"ResConvBlock error with {name}")
+
+    return name.replace(name[:6], RES_CONV_MAP[name[:6]])
+
+
+def convert_attn_naming(name):
+    for key, value in ATTN_MAP.items():
+        if name.startswith(key) and not isinstance(value, list):
+            return name.replace(key, value)
+        elif name.startswith(key):
+            return [name.replace(key, v) for v in value]
+    raise ValueError(f"Attn error with {name}")
+
+
+def rename(input_string, max_depth=13):
+    string = input_string
+
+    if string.split(".")[0] == "timestep_embed":
+        return string.replace("timestep_embed", "time_proj")
+
+    depth = 0
+    if string.startswith("net.3."):
+        depth += 1
+        string = string[6:]
+    elif string.startswith("net."):
+        string = string[4:]
+
+    while string.startswith("main.7."):
+        depth += 1
+        string = string[7:]
+
+    if string.startswith("main."):
+        string = string[5:]
+
+    # mid block
+    if string[:2].isdigit():
+        layer_num = string[:2]
+        string_left = string[2:]
+    else:
+        layer_num = string[0]
+        string_left = string[1:]
+
+    if depth == max_depth:
+        new_layer = MID_NUM_TO_LAYER[layer_num]
+        prefix = "mid_block"
+    elif depth > 0 and int(layer_num) < 7:
+        new_layer = DOWN_NUM_TO_LAYER[layer_num]
+        prefix = f"down_blocks.{depth}"
+    elif depth > 0 and int(layer_num) > 7:
+        new_layer = UP_NUM_TO_LAYER[layer_num]
+        prefix = f"up_blocks.{max_depth - depth - 1}"
+    elif depth == 0:
+        new_layer = DEPTH_0_TO_LAYER[layer_num]
+        prefix = f"up_blocks.{max_depth - 1}" if int(layer_num) > 3 else "down_blocks.0"
+
+    if not string_left.startswith("."):
+        raise ValueError(f"Naming error with {input_string} and string_left: {string_left}.")
+
+    string_left = string_left[1:]
+
+    if "resnets" in new_layer:
+        string_left = convert_resconv_naming(string_left)
+    elif "attentions" in new_layer:
+        new_string_left = convert_attn_naming(string_left)
+        string_left = new_string_left
+
+    if not isinstance(string_left, list):
+        new_string = prefix + "." + new_layer + "." + string_left
+    else:
+        new_string = [prefix + "." + new_layer + "." + s for s in string_left]
+    return new_string
+
+
+def rename_orig_weights(state_dict):
+    new_state_dict = {}
+    for k, v in state_dict.items():
+        if k.endswith("kernel"):
+            # up- and downsample layers, don't have trainable weights
+            continue
+
+        new_k = rename(k)
+
+        # check if we need to transform from Conv => Linear for attention
+        if isinstance(new_k, list):
+            new_state_dict = transform_conv_attns(new_state_dict, new_k, v)
+        else:
+            new_state_dict[new_k] = v
+
+    return new_state_dict
+
+
+def transform_conv_attns(new_state_dict, new_k, v):
+    if len(new_k) == 1:
+        if len(v.shape) == 3:
+            # weight
+            new_state_dict[new_k[0]] = v[:, :, 0]
+        else:
+            # bias
+            new_state_dict[new_k[0]] = v
+    else:
+        # qkv matrices
+        trippled_shape = v.shape[0]
+        single_shape = trippled_shape // 3
+        for i in range(3):
+            if len(v.shape) == 3:
+                new_state_dict[new_k[i]] = v[i * single_shape : (i + 1) * single_shape, :, 0]
+            else:
+                new_state_dict[new_k[i]] = v[i * single_shape : (i + 1) * single_shape]
+    return new_state_dict
+
+
+def main(args):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    model_name = args.model_path.split("/")[-1].split(".")[0]
+    if not os.path.isfile(args.model_path):
+        assert (
+            model_name == args.model_path
+        ), f"Make sure to provide one of the official model names {MODELS_MAP.keys()}"
+        args.model_path = download(model_name)
+
+    sample_rate = MODELS_MAP[model_name]["sample_rate"]
+    sample_size = MODELS_MAP[model_name]["sample_size"]
+
+    config = Object()
+    config.sample_size = sample_size
+    config.sample_rate = sample_rate
+    config.latent_dim = 0
+
+    diffusers_model = UNet1DModel(sample_size=sample_size, sample_rate=sample_rate)
+    diffusers_state_dict = diffusers_model.state_dict()
+
+    orig_model = DiffusionUncond(config)
+    orig_model.load_state_dict(torch.load(args.model_path, map_location=device)["state_dict"])
+    orig_model = orig_model.diffusion_ema.eval()
+    orig_model_state_dict = orig_model.state_dict()
+    renamed_state_dict = rename_orig_weights(orig_model_state_dict)
+
+    renamed_minus_diffusers = set(renamed_state_dict.keys()) - set(diffusers_state_dict.keys())
+    diffusers_minus_renamed = set(diffusers_state_dict.keys()) - set(renamed_state_dict.keys())
+
+    assert len(renamed_minus_diffusers) == 0, f"Problem with {renamed_minus_diffusers}"
+    assert all(k.endswith("kernel") for k in list(diffusers_minus_renamed)), f"Problem with {diffusers_minus_renamed}"
+
+    for key, value in renamed_state_dict.items():
+        assert (
+            diffusers_state_dict[key].squeeze().shape == value.squeeze().shape
+        ), f"Shape for {key} doesn't match. Diffusers: {diffusers_state_dict[key].shape} vs. {value.shape}"
+        if key == "time_proj.weight":
+            value = value.squeeze()
+
+        diffusers_state_dict[key] = value
+
+    diffusers_model.load_state_dict(diffusers_state_dict)
+
+    steps = 100
+    seed = 33
+
+    diffusers_scheduler = IPNDMScheduler(num_train_timesteps=steps)
+
+    generator = torch.manual_seed(seed)
+    noise = torch.randn([1, 2, config.sample_size], generator=generator).to(device)
+
+    t = torch.linspace(1, 0, steps + 1, device=device)[:-1]
+    step_list = get_crash_schedule(t)
+
+    pipe = DanceDiffusionPipeline(unet=diffusers_model, scheduler=diffusers_scheduler)
+
+    generator = torch.manual_seed(33)
+    audio = pipe(num_inference_steps=steps, generator=generator).audios
+
+    generated = sampling.iplms_sample(orig_model, noise, step_list, {})
+    generated = generated.clamp(-1, 1)
+
+    diff_sum = (generated - audio).abs().sum()
+    diff_max = (generated - audio).abs().max()
+
+    if args.save:
+        pipe.save_pretrained(args.checkpoint_path)
+
+    print("Diff sum", diff_sum)
+    print("Diff max", diff_max)
+
+    assert diff_max < 1e-3, f"Diff max: {diff_max} is too much :-/"
+
+    print(f"Conversion for {model_name} successful!")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.")
+    parser.add_argument(
+        "--save", default=True, type=bool, required=False, help="Whether to save the converted model or not."
+    )
+    parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.")
+    args = parser.parse_args()
+
+    main(args)
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 8a0dbc9de125..2c531cf8cee0 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -18,7 +18,7 @@
 
 if is_torch_available():
     from .modeling_utils import ModelMixin
-    from .models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
+    from .models import AutoencoderKL, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel
     from .optimization import (
         get_constant_schedule,
         get_constant_schedule_with_warmup,
@@ -29,10 +29,19 @@
         get_scheduler,
     )
     from .pipeline_utils import DiffusionPipeline
-    from .pipelines import DDIMPipeline, DDPMPipeline, KarrasVePipeline, LDMPipeline, PNDMPipeline, ScoreSdeVePipeline
+    from .pipelines import (
+        DanceDiffusionPipeline,
+        DDIMPipeline,
+        DDPMPipeline,
+        KarrasVePipeline,
+        LDMPipeline,
+        PNDMPipeline,
+        ScoreSdeVePipeline,
+    )
     from .schedulers import (
         DDIMScheduler,
         DDPMScheduler,
+        IPNDMScheduler,
         KarrasVeScheduler,
         PNDMScheduler,
         SchedulerMixin,
diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
index 1242ad6fca7f..c5d53b2feb4b 100644
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -16,6 +16,7 @@
 
 
 if is_torch_available():
+    from .unet_1d import UNet1DModel
     from .unet_2d import UNet2DModel
     from .unet_2d_condition import UNet2DConditionModel
     from .vae import AutoencoderKL, VQModel
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
index 06b814e2bbcd..35715e17fc47 100644
--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -101,17 +101,28 @@ def forward(self, timesteps):
 class GaussianFourierProjection(nn.Module):
     """Gaussian Fourier embeddings for noise levels."""
 
-    def __init__(self, embedding_size: int = 256, scale: float = 1.0):
+    def __init__(
+        self, embedding_size: int = 256, scale: float = 1.0, set_W_to_weight=True, log=True, flip_sin_to_cos=False
+    ):
         super().__init__()
         self.weight = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+        self.log = log
+        self.flip_sin_to_cos = flip_sin_to_cos
 
-        # to delete later
-        self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+        if set_W_to_weight:
+            # to delete later
+            self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
 
-        self.weight = self.W
+            self.weight = self.W
 
     def forward(self, x):
-        x = torch.log(x)
+        if self.log:
+            x = torch.log(x)
+
         x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi
-        out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
+
+        if self.flip_sin_to_cos:
+            out = torch.cat([torch.cos(x_proj), torch.sin(x_proj)], dim=-1)
+        else:
+            out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
         return out
diff --git a/src/diffusers/models/unet_1d.py b/src/diffusers/models/unet_1d.py
new file mode 100644
index 000000000000..73cf0785ec1e
--- /dev/null
+++ b/src/diffusers/models/unet_1d.py
@@ -0,0 +1,172 @@
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..modeling_utils import ModelMixin
+from ..utils import BaseOutput
+from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
+from .unet_1d_blocks import get_down_block, get_mid_block, get_up_block
+
+
+@dataclass
+class UNet1DOutput(BaseOutput):
+    """
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, sample_size)`):
+            Hidden states output. Output of last layer of model.
+    """
+
+    sample: torch.FloatTensor
+
+
+class UNet1DModel(ModelMixin, ConfigMixin):
+    r"""
+    UNet1DModel is a 1D UNet model that takes in a noisy sample and a timestep and returns sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the model (such as downloading or saving, etc.)
+
+    Parameters:
+        sample_size (`int`, *optionl*): Default length of sample. Should be adaptable at runtime.
+        in_channels (`int`, *optional*, defaults to 2): Number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 2): Number of channels in the output.
+        time_embedding_type (`str`, *optional*, defaults to `"fourier"`): Type of time embedding to use.
+        freq_shift (`int`, *optional*, defaults to 0): Frequency shift for fourier time embedding.
+        flip_sin_to_cos (`bool`, *optional*, defaults to :
+            obj:`False`): Whether to flip sin to cos for fourier time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to :
+            obj:`("DownBlock1D", "DownBlock1DNoSkip", "AttnDownBlock1D")`): Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to :
+            obj:`("UpBlock1D", "UpBlock1DNoSkip", "AttnUpBlock1D")`): Tuple of upsample block types.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to :
+            obj:`(32, 32, 64)`): Tuple of block output channels.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: int = 65536,
+        sample_rate: Optional[int] = None,
+        in_channels: int = 2,
+        out_channels: int = 2,
+        extra_in_channels: int = 0,
+        time_embedding_type: str = "fourier",
+        freq_shift: int = 0,
+        flip_sin_to_cos: bool = True,
+        use_timestep_embedding: bool = False,
+        down_block_types: Tuple[str] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
+        mid_block_type: str = "UNetMidBlock1D",
+        up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
+        block_out_channels: Tuple[int] = (32, 32, 64),
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+
+        # time
+        if time_embedding_type == "fourier":
+            self.time_proj = GaussianFourierProjection(
+                embedding_size=8, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+            )
+            timestep_input_dim = 2 * block_out_channels[0]
+        elif time_embedding_type == "positional":
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+
+        if use_timestep_embedding:
+            time_embed_dim = block_out_channels[0] * 4
+            self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+        self.down_blocks = nn.ModuleList([])
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+        self.out_block = None
+
+        # down
+        output_channel = in_channels
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+
+            if i == 0:
+                input_channel += extra_in_channels
+
+            down_block = get_down_block(
+                down_block_type,
+                in_channels=input_channel,
+                out_channels=output_channel,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = get_mid_block(
+            mid_block_type=mid_block_type,
+            mid_channels=block_out_channels[-1],
+            in_channels=block_out_channels[-1],
+            out_channels=None,
+        )
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else out_channels
+
+            up_block = get_up_block(
+                up_block_type,
+                in_channels=prev_output_channel,
+                out_channels=output_channel,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # TODO(PVP, Nathan) placeholder for RL application to be merged shortly
+        # Totally fine to add another layer with a if statement - no need for nn.Identity here
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        return_dict: bool = True,
+    ) -> Union[UNet1DOutput, Tuple]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): `(batch_size, sample_size, num_channels)` noisy inputs tensor
+            timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.unet_1d.UNet1DOutput`] or `tuple`: [`~models.unet_1d.UNet1DOutput`] if `return_dict` is True,
+            otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+        """
+        # 1. time
+        if len(timestep.shape) == 0:
+            timestep = timestep[None]
+
+        timestep_embed = self.time_proj(timestep)[..., None]
+        timestep_embed = timestep_embed.repeat([1, 1, sample.shape[2]])
+
+        # 2. down
+        down_block_res_samples = ()
+        for downsample_block in self.down_blocks:
+            sample, res_samples = downsample_block(hidden_states=sample, temb=timestep_embed)
+            down_block_res_samples += res_samples
+
+        # 3. mid
+        sample = self.mid_block(sample)
+
+        # 4. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            res_samples = down_block_res_samples[-1:]
+            down_block_res_samples = down_block_res_samples[:-1]
+            sample = upsample_block(sample, res_samples)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet1DOutput(sample=sample)
diff --git a/src/diffusers/models/unet_1d_blocks.py b/src/diffusers/models/unet_1d_blocks.py
new file mode 100644
index 000000000000..9009071d1e78
--- /dev/null
+++ b/src/diffusers/models/unet_1d_blocks.py
@@ -0,0 +1,384 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+_kernels = {
+    "linear": [1 / 8, 3 / 8, 3 / 8, 1 / 8],
+    "cubic": [-0.01171875, -0.03515625, 0.11328125, 0.43359375, 0.43359375, 0.11328125, -0.03515625, -0.01171875],
+    "lanczos3": [
+        0.003689131001010537,
+        0.015056144446134567,
+        -0.03399861603975296,
+        -0.066637322306633,
+        0.13550527393817902,
+        0.44638532400131226,
+        0.44638532400131226,
+        0.13550527393817902,
+        -0.066637322306633,
+        -0.03399861603975296,
+        0.015056144446134567,
+        0.003689131001010537,
+    ],
+}
+
+
+class Downsample1d(nn.Module):
+    def __init__(self, kernel="linear", pad_mode="reflect"):
+        super().__init__()
+        self.pad_mode = pad_mode
+        kernel_1d = torch.tensor(_kernels[kernel])
+        self.pad = kernel_1d.shape[0] // 2 - 1
+        self.register_buffer("kernel", kernel_1d)
+
+    def forward(self, hidden_states):
+        hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode)
+        weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
+        indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        weight[indices, indices] = self.kernel.to(weight)
+        return F.conv1d(hidden_states, weight, stride=2)
+
+
+class Upsample1d(nn.Module):
+    def __init__(self, kernel="linear", pad_mode="reflect"):
+        super().__init__()
+        self.pad_mode = pad_mode
+        kernel_1d = torch.tensor(_kernels[kernel]) * 2
+        self.pad = kernel_1d.shape[0] // 2 - 1
+        self.register_buffer("kernel", kernel_1d)
+
+    def forward(self, hidden_states):
+        hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode)
+        weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
+        indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        weight[indices, indices] = self.kernel.to(weight)
+        return F.conv_transpose1d(hidden_states, weight, stride=2, padding=self.pad * 2 + 1)
+
+
+class SelfAttention1d(nn.Module):
+    def __init__(self, in_channels, n_head=1, dropout_rate=0.0):
+        super().__init__()
+        self.channels = in_channels
+        self.group_norm = nn.GroupNorm(1, num_channels=in_channels)
+        self.num_heads = n_head
+
+        self.query = nn.Linear(self.channels, self.channels)
+        self.key = nn.Linear(self.channels, self.channels)
+        self.value = nn.Linear(self.channels, self.channels)
+
+        self.proj_attn = nn.Linear(self.channels, self.channels, 1)
+
+        self.dropout = nn.Dropout(dropout_rate, inplace=True)
+
+    def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
+        new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
+        # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
+        new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
+        return new_projection
+
+    def forward(self, hidden_states):
+        residual = hidden_states
+        batch, channel_dim, seq = hidden_states.shape
+
+        hidden_states = self.group_norm(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+
+        query_proj = self.query(hidden_states)
+        key_proj = self.key(hidden_states)
+        value_proj = self.value(hidden_states)
+
+        query_states = self.transpose_for_scores(query_proj)
+        key_states = self.transpose_for_scores(key_proj)
+        value_states = self.transpose_for_scores(value_proj)
+
+        scale = 1 / math.sqrt(math.sqrt(key_states.shape[-1]))
+
+        attention_scores = torch.matmul(query_states * scale, key_states.transpose(-1, -2) * scale)
+        attention_probs = torch.softmax(attention_scores, dim=-1)
+
+        # compute attention output
+        hidden_states = torch.matmul(attention_probs, value_states)
+
+        hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
+        new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
+        hidden_states = hidden_states.view(new_hidden_states_shape)
+
+        # compute next hidden_states
+        hidden_states = self.proj_attn(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+        hidden_states = self.dropout(hidden_states)
+
+        output = hidden_states + residual
+
+        return output
+
+
+class ResConvBlock(nn.Module):
+    def __init__(self, in_channels, mid_channels, out_channels, is_last=False):
+        super().__init__()
+        self.is_last = is_last
+        self.has_conv_skip = in_channels != out_channels
+
+        if self.has_conv_skip:
+            self.conv_skip = nn.Conv1d(in_channels, out_channels, 1, bias=False)
+
+        self.conv_1 = nn.Conv1d(in_channels, mid_channels, 5, padding=2)
+        self.group_norm_1 = nn.GroupNorm(1, mid_channels)
+        self.gelu_1 = nn.GELU()
+        self.conv_2 = nn.Conv1d(mid_channels, out_channels, 5, padding=2)
+
+        if not self.is_last:
+            self.group_norm_2 = nn.GroupNorm(1, out_channels)
+            self.gelu_2 = nn.GELU()
+
+    def forward(self, hidden_states):
+        residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states
+
+        hidden_states = self.conv_1(hidden_states)
+        hidden_states = self.group_norm_1(hidden_states)
+        hidden_states = self.gelu_1(hidden_states)
+        hidden_states = self.conv_2(hidden_states)
+
+        if not self.is_last:
+            hidden_states = self.group_norm_2(hidden_states)
+            hidden_states = self.gelu_2(hidden_states)
+
+        output = hidden_states + residual
+        return output
+
+
+def get_down_block(down_block_type, out_channels, in_channels):
+    if down_block_type == "DownBlock1D":
+        return DownBlock1D(out_channels=out_channels, in_channels=in_channels)
+    elif down_block_type == "AttnDownBlock1D":
+        return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels)
+    elif down_block_type == "DownBlock1DNoSkip":
+        return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels)
+    raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(up_block_type, in_channels, out_channels):
+    if up_block_type == "UpBlock1D":
+        return UpBlock1D(in_channels=in_channels, out_channels=out_channels)
+    elif up_block_type == "AttnUpBlock1D":
+        return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels)
+    elif up_block_type == "UpBlock1DNoSkip":
+        return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels)
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+def get_mid_block(mid_block_type, in_channels, mid_channels, out_channels):
+    if mid_block_type == "UNetMidBlock1D":
+        return UNetMidBlock1D(in_channels=in_channels, mid_channels=mid_channels, out_channels=out_channels)
+    raise ValueError(f"{mid_block_type} does not exist.")
+
+
+class UNetMidBlock1D(nn.Module):
+    def __init__(self, mid_channels, in_channels, out_channels=None):
+        super().__init__()
+
+        out_channels = in_channels if out_channels is None else out_channels
+
+        # there is always at least one resnet
+        self.down = Downsample1d("cubic")
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+        attentions = [
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(out_channels, out_channels // 32),
+        ]
+        self.up = Upsample1d(kernel="cubic")
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states):
+        hidden_states = self.down(hidden_states)
+        for attn, resnet in zip(self.attentions, self.resnets):
+            hidden_states = resnet(hidden_states)
+            hidden_states = attn(hidden_states)
+
+        hidden_states = self.up(hidden_states)
+
+        return hidden_states
+
+
+class AttnDownBlock1D(nn.Module):
+    def __init__(self, out_channels, in_channels, mid_channels=None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        self.down = Downsample1d("cubic")
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+        attentions = [
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(out_channels, out_channels // 32),
+        ]
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states, temb=None):
+        hidden_states = self.down(hidden_states)
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states)
+            hidden_states = attn(hidden_states)
+
+        return hidden_states, (hidden_states,)
+
+
+class DownBlock1D(nn.Module):
+    def __init__(self, out_channels, in_channels, mid_channels=None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        self.down = Downsample1d("cubic")
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states, temb=None):
+        hidden_states = self.down(hidden_states)
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states, (hidden_states,)
+
+
+class DownBlock1DNoSkip(nn.Module):
+    def __init__(self, out_channels, in_channels, mid_channels=None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states, temb=None):
+        hidden_states = torch.cat([hidden_states, temb], dim=1)
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states, (hidden_states,)
+
+
+class AttnUpBlock1D(nn.Module):
+    def __init__(self, in_channels, out_channels, mid_channels=None):
+        super().__init__()
+        mid_channels = out_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+        attentions = [
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(mid_channels, mid_channels // 32),
+            SelfAttention1d(out_channels, out_channels // 32),
+        ]
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.up = Upsample1d(kernel="cubic")
+
+    def forward(self, hidden_states, res_hidden_states_tuple):
+        res_hidden_states = res_hidden_states_tuple[-1]
+        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states)
+            hidden_states = attn(hidden_states)
+
+        hidden_states = self.up(hidden_states)
+
+        return hidden_states
+
+
+class UpBlock1D(nn.Module):
+    def __init__(self, in_channels, out_channels, mid_channels=None):
+        super().__init__()
+        mid_channels = in_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels),
+        ]
+
+        self.resnets = nn.ModuleList(resnets)
+        self.up = Upsample1d(kernel="cubic")
+
+    def forward(self, hidden_states, res_hidden_states_tuple):
+        res_hidden_states = res_hidden_states_tuple[-1]
+        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        hidden_states = self.up(hidden_states)
+
+        return hidden_states
+
+
+class UpBlock1DNoSkip(nn.Module):
+    def __init__(self, in_channels, out_channels, mid_channels=None):
+        super().__init__()
+        mid_channels = in_channels if mid_channels is None else mid_channels
+
+        resnets = [
+            ResConvBlock(2 * in_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, mid_channels),
+            ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True),
+        ]
+
+        self.resnets = nn.ModuleList(resnets)
+
+    def forward(self, hidden_states, res_hidden_states_tuple):
+        res_hidden_states = res_hidden_states_tuple[-1]
+        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+        for resnet in self.resnets:
+            hidden_states = resnet(hidden_states)
+
+        return hidden_states
diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py
index b580b4ed4056..641c253c86f8 100644
--- a/src/diffusers/models/unet_2d.py
+++ b/src/diffusers/models/unet_2d.py
@@ -21,7 +21,7 @@
 from ..modeling_utils import ModelMixin
 from ..utils import BaseOutput
 from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
-from .unet_blocks import UNetMidBlock2D, get_down_block, get_up_block
+from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
 
 
 @dataclass
diff --git a/src/diffusers/models/unet_blocks.py b/src/diffusers/models/unet_2d_blocks.py
similarity index 100%
rename from src/diffusers/models/unet_blocks.py
rename to src/diffusers/models/unet_2d_blocks.py
diff --git a/src/diffusers/models/unet_blocks_flax.py b/src/diffusers/models/unet_2d_blocks_flax.py
similarity index 100%
rename from src/diffusers/models/unet_blocks_flax.py
rename to src/diffusers/models/unet_2d_blocks_flax.py
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index a65e238969dd..d271b78a6525 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -22,7 +22,7 @@
 from ..modeling_utils import ModelMixin
 from ..utils import BaseOutput, logging
 from .embeddings import TimestepEmbedding, Timesteps
-from .unet_blocks import (
+from .unet_2d_blocks import (
     CrossAttnDownBlock2D,
     CrossAttnUpBlock2D,
     DownBlock2D,
diff --git a/src/diffusers/models/unet_2d_condition_flax.py b/src/diffusers/models/unet_2d_condition_flax.py
index 5411855f79d5..f0e721826bd7 100644
--- a/src/diffusers/models/unet_2d_condition_flax.py
+++ b/src/diffusers/models/unet_2d_condition_flax.py
@@ -23,7 +23,7 @@
 from ..modeling_flax_utils import FlaxModelMixin
 from ..utils import BaseOutput
 from .embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps
-from .unet_blocks_flax import (
+from .unet_2d_blocks_flax import (
     FlaxCrossAttnDownBlock2D,
     FlaxCrossAttnUpBlock2D,
     FlaxDownBlock2D,
diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py
index f1c0fc1416ee..5f5a47dada0f 100644
--- a/src/diffusers/models/vae.py
+++ b/src/diffusers/models/vae.py
@@ -21,7 +21,7 @@
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..modeling_utils import ModelMixin
 from ..utils import BaseOutput
-from .unet_blocks import UNetMidBlock2D, get_down_block, get_up_block
+from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
 
 
 @dataclass
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 894505654e47..5c08a2db39fa 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -93,6 +93,20 @@ class ImagePipelineOutput(BaseOutput):
     images: Union[List[PIL.Image.Image], np.ndarray]
 
 
+@dataclass
+class AudioPipelineOutput(BaseOutput):
+    """
+    Output class for audio pipelines.
+
+    Args:
+        audios (`np.ndarray`)
+            List of denoised samples of shape `(batch_size, num_channels, sample_rate)`. Numpy array present the
+            denoised audio samples of the diffusion pipeline.
+    """
+
+    audios: np.ndarray
+
+
 class DiffusionPipeline(ConfigMixin):
     r"""
     Base class for all models.
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 3c4f877fa830..b3124af39077 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -2,6 +2,7 @@
 
 
 if is_torch_available():
+    from .dance_diffusion import DanceDiffusionPipeline
     from .ddim import DDIMPipeline
     from .ddpm import DDPMPipeline
     from .latent_diffusion_uncond import LDMPipeline
diff --git a/src/diffusers/pipelines/dance_diffusion/__init__.py b/src/diffusers/pipelines/dance_diffusion/__init__.py
new file mode 100644
index 000000000000..2ad34fc52aaa
--- /dev/null
+++ b/src/diffusers/pipelines/dance_diffusion/__init__.py
@@ -0,0 +1,2 @@
+# flake8: noqa
+from .pipeline_dance_diffusion import DanceDiffusionPipeline
diff --git a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
new file mode 100644
index 000000000000..dfbb980be8d8
--- /dev/null
+++ b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
@@ -0,0 +1,113 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ...pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class DanceDiffusionPipeline(DiffusionPipeline):
+    r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        unet ([`UNet1DModel`]): U-Net architecture to denoise the encoded image.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`IPNDMScheduler`].
+    """
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        num_inference_steps: int = 100,
+        generator: Optional[torch.Generator] = None,
+        sample_length_in_s: Optional[float] = None,
+        return_dict: bool = True,
+    ) -> Union[AudioPipelineOutput, Tuple]:
+        r"""
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of audio samples to generate.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality audio sample at
+                the expense of slower inference.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipeline_utils.AudioPipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipeline_utils.AudioPipelineOutput`] or `tuple`: [`~pipelines.utils.AudioPipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        if sample_length_in_s is None:
+            sample_length_in_s = self.unet.sample_size / self.unet.sample_rate
+
+        sample_size = sample_length_in_s * self.unet.sample_rate
+
+        down_scale_factor = 2 ** len(self.unet.up_blocks)
+        if sample_size < 3 * down_scale_factor:
+            raise ValueError(
+                f"{sample_length_in_s} is too small. Make sure it's bigger or equal to"
+                f" {3 * down_scale_factor / self.unet.sample_rate}."
+            )
+
+        original_sample_size = int(sample_size)
+        if sample_size % down_scale_factor != 0:
+            sample_size = ((sample_length_in_s * self.unet.sample_rate) // down_scale_factor + 1) * down_scale_factor
+            logger.info(
+                f"{sample_length_in_s} is increased to {sample_size / self.unet.sample_rate} so that it can be handled"
+                f" by the model. It will be cut to {original_sample_size / self.unet.sample_rate} after the denoising"
+                " process."
+            )
+        sample_size = int(sample_size)
+
+        audio = torch.randn((batch_size, self.unet.in_channels, sample_size), generator=generator, device=self.device)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps, device=audio.device)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # 1. predict noise model_output
+            model_output = self.unet(audio, t).sample
+
+            # 2. compute previous image: x_t -> t_t-1
+            audio = self.scheduler.step(model_output, t, audio).prev_sample
+
+        audio = audio.clamp(-1, 1).cpu().numpy()
+
+        audio = audio[:, :, :original_sample_size]
+
+        if not return_dict:
+            return (audio,)
+
+        return AudioPipelineOutput(audios=audio)
diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index 91026b7d487f..f179acde09e2 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -19,6 +19,7 @@
 if is_torch_available():
     from .scheduling_ddim import DDIMScheduler
     from .scheduling_ddpm import DDPMScheduler
+    from .scheduling_ipndm import IPNDMScheduler
     from .scheduling_karras_ve import KarrasVeScheduler
     from .scheduling_pndm import PNDMScheduler
     from .scheduling_sde_ve import ScoreSdeVeScheduler
diff --git a/src/diffusers/schedulers/scheduling_ipndm.py b/src/diffusers/schedulers/scheduling_ipndm.py
new file mode 100644
index 000000000000..fb413a280530
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_ipndm.py
@@ -0,0 +1,152 @@
+# Copyright 2022 Zhejiang University Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Tuple, Union
+
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import SchedulerMixin, SchedulerOutput
+
+
+class IPNDMScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Improved Pseudo numerical methods for diffusion models (iPNDM) ported from @crowsonkb's amazing k-diffusion
+    [library](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296)
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+    [`~ConfigMixin.from_config`] functions.
+
+    For more details, see the original paper: https://arxiv.org/abs/2202.09778
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+    """
+
+    @register_to_config
+    def __init__(self, num_train_timesteps: int = 1000):
+        # set `betas`, `alphas`, `timesteps`
+        self.set_timesteps(num_train_timesteps)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # For now we only support F-PNDM, i.e. the runge-kutta method
+        # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf
+        # mainly at formula (9), (12), (13) and the Algorithm 2.
+        self.pndm_order = 4
+
+        # running values
+        self.ets = []
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+        """
+        self.num_inference_steps = num_inference_steps
+        steps = torch.linspace(1, 0, num_inference_steps + 1)[:-1]
+        steps = torch.cat([steps, torch.tensor([0.0])])
+
+        self.betas = torch.sin(steps * math.pi / 2) ** 2
+        self.alphas = (1.0 - self.betas**2) ** 0.5
+
+        timesteps = (torch.atan2(self.betas, self.alphas) / math.pi * 2)[:-1]
+        self.timesteps = timesteps.to(device)
+
+        self.ets = []
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        sample: torch.FloatTensor,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple
+        times to approximate the solution.
+
+        Args:
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
+            return_dict (`bool`): option for returning tuple rather than SchedulerOutput class
+
+        Returns:
+            [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is
+            True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        timestep_index = (self.timesteps == timestep).nonzero().item()
+        prev_timestep_index = timestep_index + 1
+
+        ets = sample * self.betas[timestep_index] + model_output * self.alphas[timestep_index]
+        self.ets.append(ets)
+
+        if len(self.ets) == 1:
+            ets = self.ets[-1]
+        elif len(self.ets) == 2:
+            ets = (3 * self.ets[-1] - self.ets[-2]) / 2
+        elif len(self.ets) == 3:
+            ets = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12
+        else:
+            ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4])
+
+        prev_sample = self._get_prev_sample(sample, timestep_index, prev_timestep_index, ets)
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.FloatTensor`): input sample
+
+        Returns:
+            `torch.FloatTensor`: scaled input sample
+        """
+        return sample
+
+    def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, ets):
+        alpha = self.alphas[timestep_index]
+        sigma = self.betas[timestep_index]
+
+        next_alpha = self.alphas[prev_timestep_index]
+        next_sigma = self.betas[prev_timestep_index]
+
+        pred = (sample - sigma * ets) / max(alpha, 1e-8)
+        prev_sample = next_alpha * pred + ets * next_sigma
+
+        return prev_sample
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index ee748f5b1d6a..93834c76550b 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -34,6 +34,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class UNet1DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class UNet2DConditionModel(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -122,6 +137,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class DanceDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class DDIMPipeline(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -242,6 +272,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class IPNDMScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class KarrasVeScheduler(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
new file mode 100644
index 000000000000..b1aa33fcff7e
--- /dev/null
+++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -0,0 +1,101 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
+from diffusers.utils import slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class PipelineFastTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_unet(self):
+        torch.manual_seed(0)
+        model = UNet1DModel(
+            block_out_channels=(32, 32, 64),
+            extra_in_channels=16,
+            sample_size=512,
+            sample_rate=16_000,
+            in_channels=2,
+            out_channels=2,
+            down_block_types=["DownBlock1DNoSkip"] + ["DownBlock1D"] + ["AttnDownBlock1D"],
+            up_block_types=["AttnUpBlock1D"] + ["UpBlock1D"] + ["UpBlock1DNoSkip"],
+        )
+        return model
+
+    def test_dance_diffusion(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        scheduler = IPNDMScheduler()
+
+        pipe = DanceDiffusionPipeline(unet=self.dummy_unet, scheduler=scheduler)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = pipe(generator=generator, num_inference_steps=4)
+        audio = output.audios
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = pipe(generator=generator, num_inference_steps=4, return_dict=False)
+        audio_from_tuple = output[0]
+
+        audio_slice = audio[0, -3:, -3:]
+        audio_from_tuple_slice = audio_from_tuple[0, -3:, -3:]
+
+        assert audio.shape == (1, 2, self.dummy_unet.sample_size)
+        expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000])
+        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(audio_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@slow
+@require_torch_gpu
+class PipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_dance_diffusion(self):
+        device = torch_device
+
+        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k")
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = pipe(generator=generator, num_inference_steps=100, sample_length_in_s=4.096)
+        audio = output.audios
+
+        audio_slice = audio[0, -3:, -3:]
+
+        assert audio.shape == (1, 2, pipe.unet.sample_size)
+        expected_slice = np.array([-0.1576, -0.1526, -0.127, -0.2699, -0.2762, -0.2487])
+        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/test_models_unet_1d.py b/tests/test_models_unet_1d.py
new file mode 100644
index 000000000000..c274ce419211
--- /dev/null
+++ b/tests/test_models_unet_1d.py
@@ -0,0 +1,45 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import UNet1DModel
+from diffusers.utils import slow, torch_device
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class UnetModel1DTests(unittest.TestCase):
+    @slow
+    def test_unet_1d_maestro(self):
+        model_id = "harmonai/maestro-150k"
+        model = UNet1DModel.from_pretrained(model_id, subfolder="unet")
+        model.to(torch_device)
+
+        sample_size = 65536
+        noise = torch.sin(torch.arange(sample_size)[None, None, :].repeat(1, 2, 1)).to(torch_device)
+        timestep = torch.tensor([1]).to(torch_device)
+
+        with torch.no_grad():
+            output = model(noise, timestep).sample
+
+        output_sum = output.abs().sum()
+        output_max = output.abs().max()
+
+        assert (output_sum - 224.0896).abs() < 4e-2
+        assert (output_max - 0.0607).abs() < 4e-4
diff --git a/tests/test_models_unet.py b/tests/test_models_unet_2d.py
similarity index 99%
rename from tests/test_models_unet.py
rename to tests/test_models_unet_2d.py
index 7a16a8592644..393d47887e6d 100644
--- a/tests/test_models_unet.py
+++ b/tests/test_models_unet_2d.py
@@ -29,7 +29,7 @@
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
-class UnetModelTests(ModelTesterMixin, unittest.TestCase):
+class Unet2DModelTests(ModelTesterMixin, unittest.TestCase):
     model_class = UNet2DModel
 
     @property
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index c3d4b9bc76f9..899870a74a3f 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -19,7 +19,14 @@
 import numpy as np
 import torch
 
-from diffusers import DDIMScheduler, DDPMScheduler, LMSDiscreteScheduler, PNDMScheduler, ScoreSdeVeScheduler
+from diffusers import (
+    DDIMScheduler,
+    DDPMScheduler,
+    IPNDMScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    ScoreSdeVeScheduler,
+)
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
@@ -203,6 +210,10 @@ def recursive_check(tuple_object, dict_object):
         kwargs = dict(self.forward_default_kwargs)
         num_inference_steps = kwargs.pop("num_inference_steps", 50)
 
+        timestep = 0
+        if len(self.scheduler_classes) > 0 and self.scheduler_classes[0] == IPNDMScheduler:
+            timestep = 1
+
         for scheduler_class in self.scheduler_classes:
             scheduler_config = self.get_scheduler_config()
             scheduler = scheduler_class(**scheduler_config)
@@ -215,14 +226,14 @@ def recursive_check(tuple_object, dict_object):
             elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
                 kwargs["num_inference_steps"] = num_inference_steps
 
-            outputs_dict = scheduler.step(residual, 0, sample, **kwargs)
+            outputs_dict = scheduler.step(residual, timestep, sample, **kwargs)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 scheduler.set_timesteps(num_inference_steps)
             elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
                 kwargs["num_inference_steps"] = num_inference_steps
 
-            outputs_tuple = scheduler.step(residual, 0, sample, return_dict=False, **kwargs)
+            outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs)
 
             recursive_check(outputs_tuple, outputs_dict)
 
@@ -901,3 +912,157 @@ def test_full_loop_no_noise(self):
 
         assert abs(result_sum.item() - 1006.388) < 1e-2
         assert abs(result_mean.item() - 1.31) < 1e-3
+
+
+class IPNDMSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (IPNDMScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 50),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {"num_train_timesteps": 1000}
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+            # copy over dummy past residuals
+            scheduler.ets = dummy_past_residuals[:]
+
+            if time_step is None:
+                time_step = scheduler.timesteps[len(scheduler.timesteps) // 2]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler.set_timesteps(num_inference_steps)
+                # copy over dummy past residuals
+                new_scheduler.ets = dummy_past_residuals[:]
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_pretrained_save_pretrained(self):
+        pass
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.ets = dummy_past_residuals[:]
+
+            if time_step is None:
+                time_step = scheduler.timesteps[len(scheduler.timesteps) // 2]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_config(tmpdirname)
+                # copy over dummy past residuals
+                new_scheduler.set_timesteps(num_inference_steps)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_scheduler.ets = dummy_past_residuals[:]
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]
+            scheduler.ets = dummy_past_residuals[:]
+
+            time_step_0 = scheduler.timesteps[5]
+            time_step_1 = scheduler.timesteps[6]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps, time_step=None)
+
+    def test_inference_steps(self):
+        for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]):
+            self.check_over_forward(num_inference_steps=num_inference_steps, time_step=None)
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 2540529) < 10

From 365ff8f76d9dfc2086625b1c265b920be2218863 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 25 Oct 2022 19:33:43 +0200
Subject: [PATCH 044/269] [Dance Diffusion] FP16 (#980)

* add in fp16

* up
---
 src/diffusers/models/unet_1d.py                 |  2 +-
 .../dance_diffusion/pipeline_dance_diffusion.py |  8 ++++++--
 .../dance_diffusion/test_dance_diffusion.py     | 17 +++++++++++++++++
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/models/unet_1d.py b/src/diffusers/models/unet_1d.py
index 73cf0785ec1e..cc0685deb933 100644
--- a/src/diffusers/models/unet_1d.py
+++ b/src/diffusers/models/unet_1d.py
@@ -149,7 +149,7 @@ def forward(
             timestep = timestep[None]
 
         timestep_embed = self.time_proj(timestep)[..., None]
-        timestep_embed = timestep_embed.repeat([1, 1, sample.shape[2]])
+        timestep_embed = timestep_embed.repeat([1, 1, sample.shape[2]]).to(sample.dtype)
 
         # 2. down
         down_block_res_samples = ()
diff --git a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
index dfbb980be8d8..643ef3d2d615 100644
--- a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
+++ b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
@@ -91,10 +91,14 @@ def __call__(
             )
         sample_size = int(sample_size)
 
-        audio = torch.randn((batch_size, self.unet.in_channels, sample_size), generator=generator, device=self.device)
+        dtype = next(iter(self.unet.parameters())).dtype
+        audio = torch.randn(
+            (batch_size, self.unet.in_channels, sample_size), generator=generator, device=self.device, dtype=dtype
+        )
 
         # set step values
         self.scheduler.set_timesteps(num_inference_steps, device=audio.device)
+        self.scheduler.timesteps = self.scheduler.timesteps.to(dtype)
 
         for t in self.progress_bar(self.scheduler.timesteps):
             # 1. predict noise model_output
@@ -103,7 +107,7 @@ def __call__(
             # 2. compute previous image: x_t -> t_t-1
             audio = self.scheduler.step(model_output, t, audio).prev_sample
 
-        audio = audio.clamp(-1, 1).cpu().numpy()
+        audio = audio.clamp(-1, 1).float().cpu().numpy()
 
         audio = audio[:, :, :original_sample_size]
 
diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
index b1aa33fcff7e..bdf57ec08f54 100644
--- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py
+++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -99,3 +99,20 @@ def test_dance_diffusion(self):
         assert audio.shape == (1, 2, pipe.unet.sample_size)
         expected_slice = np.array([-0.1576, -0.1526, -0.127, -0.2699, -0.2762, -0.2487])
         assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_dance_diffusion_fp16(self):
+        device = torch_device
+
+        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16)
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = pipe(generator=generator, num_inference_steps=100, sample_length_in_s=4.096)
+        audio = output.audios
+
+        audio_slice = audio[0, -3:, -3:]
+
+        assert audio.shape == (1, 2, pipe.unet.sample_size)
+        expected_slice = np.array([-0.1693, -0.1698, -0.1447, -0.3044, -0.3203, -0.2937])
+        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2

From 59f0ce82eb5c7684e3cfd294005a91c56a429acd Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 25 Oct 2022 19:52:41 +0200
Subject: [PATCH 045/269] [Dance Diffusion] Better naming (#981)

uP
---
 .../dance_diffusion/pipeline_dance_diffusion.py | 17 ++++++++++-------
 .../dance_diffusion/test_dance_diffusion.py     |  4 ++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
index 643ef3d2d615..86f01ec58616 100644
--- a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
+++ b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
@@ -47,7 +47,7 @@ def __call__(
         batch_size: int = 1,
         num_inference_steps: int = 100,
         generator: Optional[torch.Generator] = None,
-        sample_length_in_s: Optional[float] = None,
+        audio_length_in_s: Optional[float] = None,
         return_dict: bool = True,
     ) -> Union[AudioPipelineOutput, Tuple]:
         r"""
@@ -60,6 +60,9 @@ def __call__(
             generator (`torch.Generator`, *optional*):
                 A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                 deterministic.
+            audio_length_in_s (`float`, *optional*, defaults to `self.unet.config.sample_size/self.unet.config.sample_rate`):
+                The length of the generated audio sample in seconds. Note that the output of the pipeline, *i.e.*
+                `sample_size`, will be `audio_length_in_s` * `self.unet.sample_rate`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipeline_utils.AudioPipelineOutput`] instead of a plain tuple.
 
@@ -69,23 +72,23 @@ def __call__(
             generated images.
         """
 
-        if sample_length_in_s is None:
-            sample_length_in_s = self.unet.sample_size / self.unet.sample_rate
+        if audio_length_in_s is None:
+            audio_length_in_s = self.unet.config.sample_size / self.unet.config.sample_rate
 
-        sample_size = sample_length_in_s * self.unet.sample_rate
+        sample_size = audio_length_in_s * self.unet.sample_rate
 
         down_scale_factor = 2 ** len(self.unet.up_blocks)
         if sample_size < 3 * down_scale_factor:
             raise ValueError(
-                f"{sample_length_in_s} is too small. Make sure it's bigger or equal to"
+                f"{audio_length_in_s} is too small. Make sure it's bigger or equal to"
                 f" {3 * down_scale_factor / self.unet.sample_rate}."
             )
 
         original_sample_size = int(sample_size)
         if sample_size % down_scale_factor != 0:
-            sample_size = ((sample_length_in_s * self.unet.sample_rate) // down_scale_factor + 1) * down_scale_factor
+            sample_size = ((audio_length_in_s * self.unet.sample_rate) // down_scale_factor + 1) * down_scale_factor
             logger.info(
-                f"{sample_length_in_s} is increased to {sample_size / self.unet.sample_rate} so that it can be handled"
+                f"{audio_length_in_s} is increased to {sample_size / self.unet.sample_rate} so that it can be handled"
                 f" by the model. It will be cut to {original_sample_size / self.unet.sample_rate} after the denoising"
                 " process."
             )
diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
index bdf57ec08f54..72e67e4479d2 100644
--- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py
+++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -91,7 +91,7 @@ def test_dance_diffusion(self):
         pipe.set_progress_bar_config(disable=None)
 
         generator = torch.Generator(device=device).manual_seed(0)
-        output = pipe(generator=generator, num_inference_steps=100, sample_length_in_s=4.096)
+        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
         audio = output.audios
 
         audio_slice = audio[0, -3:, -3:]
@@ -108,7 +108,7 @@ def test_dance_diffusion_fp16(self):
         pipe.set_progress_bar_config(disable=None)
 
         generator = torch.Generator(device=device).manual_seed(0)
-        output = pipe(generator=generator, num_inference_steps=100, sample_length_in_s=4.096)
+        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
         audio = output.audios
 
         audio_slice = audio[0, -3:, -3:]

From e2243de5f23a2dcae19e7583df6875b2513c9ac6 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Tue, 25 Oct 2022 20:20:16 +0200
Subject: [PATCH 046/269] Fix typo in documentation title (#975)

---
 docs/source/_toctree.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 43a4fc4bf11e..295e24674854 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -36,7 +36,7 @@
   - local: optimization/onnx
     title: "ONNX"
   - local: optimization/open_vino
-    title: "Open Vino"
+    title: "OpenVINO"
   - local: optimization/mps
     title: "MPS"
   title: "Optimization/Special Hardware"

From 4b9f58952a5c9cbd7db71cf050dc7d3961505730 Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe <shirayu@users.noreply.github.com>
Date: Wed, 26 Oct 2022 04:38:23 +0900
Subject: [PATCH 047/269] Add --pretrained_model_name_revision option to
 train_dreambooth.py (#933)

* Add --pretrained_model_name_revision option to train_dreambooth.py

* Renamed --pretrained_model_name_revision to --revision
---
 examples/dreambooth/train_dreambooth.py | 42 +++++++++++++++++++++----
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 635166fb4081..c28bcbd2d762 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -35,6 +35,13 @@ def parse_args():
         required=True,
         help="Path to pretrained model or model identifier from huggingface.co/models.",
     )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default=None,
+        required=False,
+        help="Revision of pretrained model identifier from huggingface.co/models.",
+    )
     parser.add_argument(
         "--tokenizer_name",
         type=str,
@@ -344,7 +351,10 @@ def main():
         if cur_class_images < args.num_class_images:
             torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
             pipeline = StableDiffusionPipeline.from_pretrained(
-                args.pretrained_model_name_or_path, torch_dtype=torch_dtype, safety_checker=None
+                args.pretrained_model_name_or_path,
+                torch_dtype=torch_dtype,
+                safety_checker=None,
+                revision=args.revision,
             )
             pipeline.set_progress_bar_config(disable=True)
 
@@ -390,14 +400,33 @@ def main():
 
     # Load the tokenizer
     if args.tokenizer_name:
-        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+        tokenizer = CLIPTokenizer.from_pretrained(
+            args.tokenizer_name,
+            revision=args.revision,
+        )
     elif args.pretrained_model_name_or_path:
-        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+        tokenizer = CLIPTokenizer.from_pretrained(
+            args.pretrained_model_name_or_path,
+            subfolder="tokenizer",
+            revision=args.revision,
+        )
 
     # Load models and create wrapper for stable diffusion
-    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
-    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
-    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+    text_encoder = CLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=args.revision,
+    )
+    vae = AutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="vae",
+        revision=args.revision,
+    )
+    unet = UNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path,
+        subfolder="unet",
+        revision=args.revision,
+    )
 
     vae.requires_grad_(False)
     if not args.train_text_encoder:
@@ -613,6 +642,7 @@ def collate_fn(examples):
             args.pretrained_model_name_or_path,
             unet=accelerator.unwrap_model(unet),
             text_encoder=accelerator.unwrap_model(text_encoder),
+            revision=args.revision,
         )
         pipeline.save_pretrained(args.output_dir)
 

From 0343d8f531d6736b2c6eb0e35eb34edd28461294 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Wed, 26 Oct 2022 11:56:43 +0200
Subject: [PATCH 048/269] Do not use torch.float64 on the mps device (#942)

* Add failing test for #940.

* Do not use torch.float64 in mps.

* style

* Temporarily skip add_noise for IPNDMScheduler.

Until #990 is addressed.
---
 .../schedulers/scheduling_lms_discrete.py      |  3 ++-
 tests/test_scheduler.py                        | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index 768413e9e6a3..8b371703ec84 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -252,7 +252,8 @@ def add_noise(
     ) -> torch.FloatTensor:
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         self.sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
-        self.timesteps = self.timesteps.to(original_samples.device)
+        dtype = torch.float32 if original_samples.device.type == "mps" else timesteps.dtype
+        self.timesteps = self.timesteps.to(original_samples.device, dtype=dtype)
         timesteps = timesteps.to(original_samples.device)
 
         schedule_timesteps = self.timesteps
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 899870a74a3f..6f86ffc85e05 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -27,6 +27,7 @@
     PNDMScheduler,
     ScoreSdeVeScheduler,
 )
+from diffusers.utils import torch_device
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
@@ -258,6 +259,23 @@ def test_scheduler_public_api(self):
             scaled_sample = scheduler.scale_model_input(sample, 0.0)
             self.assertEqual(sample.shape, scaled_sample.shape)
 
+    def test_add_noise_device(self):
+        for scheduler_class in self.scheduler_classes:
+            if scheduler_class == IPNDMScheduler:
+                # Skip until #990 is addressed
+                continue
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample.to(torch_device)
+            scaled_sample = scheduler.scale_model_input(sample, 0.0)
+            self.assertEqual(sample.shape, scaled_sample.shape)
+
+            noise = torch.randn_like(scaled_sample).to(torch_device)
+            t = torch.tensor([10]).to(torch_device)
+            noised = scheduler.add_noise(scaled_sample, noise, t)
+            self.assertEqual(noised.shape, scaled_sample.shape)
+
 
 class DDPMSchedulerTest(SchedulerCommonTest):
     scheduler_classes = (DDPMScheduler,)

From d9cfe325a53502641f16ce4f839391c5b0d0a684 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 26 Oct 2022 12:32:07 +0200
Subject: [PATCH 049/269] CompVis -> diffusers script - allow converting from
 merged checkpoint to either EMA or non-EMA (#991)

* improve script

* up
---
 ..._original_stable_diffusion_to_diffusers.py | 40 +++++++++++++++++--
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py
index db1b30736984..46073001f126 100644
--- a/scripts/convert_original_stable_diffusion_to_diffusers.py
+++ b/scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -285,15 +285,34 @@ def create_ldm_bert_config(original_config):
     return config
 
 
-def convert_ldm_unet_checkpoint(checkpoint, config):
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
     """
     Takes a state dict and a config, and returns a converted checkpoint.
     """
 
     # extract state_dict for UNet
     unet_state_dict = {}
-    unet_key = "model.diffusion_model."
     keys = list(checkpoint.keys())
+
+    unet_key = "model.diffusion_model."
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100:
+        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        if extract_ema:
+            print(
+                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+            )
+            for key in keys:
+                if key.startswith("model.diffusion_model"):
+                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+        else:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+
     for key in keys:
         if key.startswith(unet_key):
             unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
@@ -630,6 +649,15 @@ def convert_ldm_clip_checkpoint(checkpoint):
         type=str,
         help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']",
     )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
     parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
 
     args = parser.parse_args()
@@ -641,7 +669,9 @@ def convert_ldm_clip_checkpoint(checkpoint):
         args.original_config_file = "./v1-inference.yaml"
 
     original_config = OmegaConf.load(args.original_config_file)
-    checkpoint = torch.load(args.checkpoint_path)["state_dict"]
+
+    checkpoint = torch.load(args.checkpoint_path)
+    checkpoint = checkpoint["state_dict"]
 
     num_train_timesteps = original_config.model.params.timesteps
     beta_start = original_config.model.params.linear_start
@@ -669,7 +699,9 @@ def convert_ldm_clip_checkpoint(checkpoint):
 
     # Convert the UNet2DConditionModel model.
     unet_config = create_unet_diffusers_config(original_config)
-    converted_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config)
+    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+        checkpoint, unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema
+    )
 
     unet = UNet2DConditionModel(**unet_config)
     unet.load_state_dict(converted_unet_checkpoint)

From d7d6841406a2cef52da26fc58342e543b5cd9e1d Mon Sep 17 00:00:00 2001
From: Hu Ye <xiaohuzc@gmail.com>
Date: Wed, 26 Oct 2022 20:26:17 +0800
Subject: [PATCH 050/269] fix a bug in the new version (#957)

remove tensor_format in the new version
---
 examples/text_to_image/train_text_to_image.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index c3c3c75438d0..859a891ecf37 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -375,7 +375,7 @@ def main():
 
     # TODO (patil-suraj): load scheduler using args
     noise_scheduler = DDPMScheduler(
-        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, tensor_format="pt"
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
     )
 
     # Get the datasets: you can either provide your own training and evaluation files (see below)

From cc436087d37d6028ff85acab9a957778d0d23e39 Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe <shirayu@users.noreply.github.com>
Date: Wed, 26 Oct 2022 22:32:47 +0900
Subject: [PATCH 051/269] Fix typos (#978)

---
 examples/community/README.md                      | 2 +-
 examples/community/composable_stable_diffusion.py | 4 ++--
 examples/community/lpw_stable_diffusion.py        | 6 +++---
 examples/community/lpw_stable_diffusion_onnx.py   | 6 +++---
 src/diffusers/pipeline_utils.py                   | 4 ++--
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/community/README.md b/examples/community/README.md
index 8f70e004e5fd..3f92f202d8c4 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -355,7 +355,7 @@ generator = th.Generator("cuda").manual_seed(0)
 
 seed = 0
 prompt = "a forest | a camel"
-weights = " 1 | 1"  # Equal weight to each prompt. Cna be negative
+weights = " 1 | 1"  # Equal weight to each prompt. Can be negative
 
 images = []
 for i in range(4):
diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py
index 308c18287e02..10d34d255a20 100644
--- a/examples/community/composable_stable_diffusion.py
+++ b/examples/community/composable_stable_diffusion.py
@@ -133,7 +133,7 @@ def __call__(
                 tensor will ge generated by sampling using the supplied random `generator`.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`.
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                 plain tuple.
@@ -264,7 +264,7 @@ def __call__(
 
         self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
 
-        # if we use LMSDiscreteScheduler, let's make sure latents are mulitplied by sigmas
+        # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
         if isinstance(self.scheduler, LMSDiscreteScheduler):
             latents = latents * self.scheduler.sigmas[0]
 
diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index 4980f8c8be06..1b2279de720c 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -40,7 +40,7 @@
 
 def parse_prompt_attention(text):
     """
-    Parses a string with attention tokens and returns a list of pairs: text and its assoicated weight.
+    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
     Accepted tokens are:
       (abc) - increases attention to abc by a multiplier of 1.1
       (abc:3.12) - increases attention to abc by a multiplier of 3.12
@@ -237,9 +237,9 @@ def get_weighted_text_embeddings(
     r"""
     Prompts can be assigned with local weights using brackets. For example,
     prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
-    and the embedding tokens corresponding to the words get multipled by a constant, 1.1.
+    and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
 
-    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the origional mean.
+    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean.
 
     Args:
         pipe (`DiffusionPipeline`):
diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py
index 4ca37c0c4ad4..37f03c86f29d 100644
--- a/examples/community/lpw_stable_diffusion_onnx.py
+++ b/examples/community/lpw_stable_diffusion_onnx.py
@@ -38,7 +38,7 @@
 
 def parse_prompt_attention(text):
     """
-    Parses a string with attention tokens and returns a list of pairs: text and its assoicated weight.
+    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
     Accepted tokens are:
       (abc) - increases attention to abc by a multiplier of 1.1
       (abc:3.12) - increases attention to abc by a multiplier of 3.12
@@ -236,9 +236,9 @@ def get_weighted_text_embeddings(
     r"""
     Prompts can be assigned with local weights using brackets. For example,
     prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
-    and the embedding tokens corresponding to the words get multipled by a constant, 1.1.
+    and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
 
-    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the origional mean.
+    Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean.
 
     Args:
         pipe (`DiffusionPipeline`):
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 5c08a2db39fa..d307bd5a076e 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -584,7 +584,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
     def components(self) -> Dict[str, Any]:
         r"""
 
-        The `self.compenents` property can be useful to run different pipelines with the same weights and
+        The `self.components` property can be useful to run different pipelines with the same weights and
         configurations to not have to re-allocate memory.
 
         Examples:
@@ -602,7 +602,7 @@ def components(self) -> Dict[str, Any]:
         ```
 
         Returns:
-            A dictionaly containing all the modules needed to initialize the pipleline.
+            A dictionaly containing all the modules needed to initialize the pipeline.
         """
         components = {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}
         expected_modules = set(inspect.signature(self.__init__).parameters.keys()) - set(["self"])

From 2f0fcf4fa82c21fcbe56fac715be64766b6606f4 Mon Sep 17 00:00:00 2001
From: Julien Simon <julien@julien.org>
Date: Wed, 26 Oct 2022 06:45:39 -0700
Subject: [PATCH 052/269] Add missing import (#979)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index a5f47dfcb032..ac518fb8ec76 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,7 @@ We recommend using the model in [half-precision (`fp16`)](https://pytorch.org/bl
 precision while being roughly twice as fast and requiring half the amount of GPU RAM.
 
 ```python
+import torch
 from diffusers import StableDiffusionPipeline
 
 pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, revision="fp16")

From b2e2d1411ce394ef15c41aafb34b3c08beedff0f Mon Sep 17 00:00:00 2001
From: Pi Esposito <piero.skywalker@gmail.com>
Date: Wed, 26 Oct 2022 10:52:57 -0300
Subject: [PATCH 053/269] minimal stable diffusion GPU memory usage with
 accelerate hooks (#850)

* add method to enable cuda with minimal gpu usage to stable diffusion

* add test to minimal cuda memory usage

* ensure all models but unet are onn torch.float32

* move to cpu_offload along with minor internal changes to make it work

* make it test against accelerate master branch

* coming back, its official: I don't know how to make it test againt the master branch from accelerate

* make it install accelerate from master on tests

* go back to accelerate>=0.11

* undo prettier formatting on yml files

* undo prettier formatting on yml files againn
---
 .github/workflows/pr_tests.yml                |  2 ++
 .github/workflows/push_tests.yml              |  4 ++--
 src/diffusers/pipeline_utils.py               |  2 ++
 .../pipeline_stable_diffusion.py              | 13 ++++++++++++
 tests/test_pipelines.py                       | 20 +++++++++++++++++++
 5 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index cf21edf99165..81c75fecec6e 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -34,6 +34,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
         python -m pip install -e .[quality,test]
+        python -m pip install git+https://github.com/huggingface/accelerate
 
     - name: Environment
       run: |
@@ -80,6 +81,7 @@ jobs:
         ${CONDA_RUN} python -m pip install --upgrade pip
         ${CONDA_RUN} python -m pip install -e .[quality,test]
         ${CONDA_RUN} python -m pip install --pre torch==${MPS_TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/test/cpu
+        ${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
 
     - name: Environment
       shell: arch -arch arm64 bash {0}
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 3e4a81c91c01..dfd83aa9af46 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -36,6 +36,7 @@ jobs:
         python -m pip uninstall -y torch torchvision torchtext
         python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu116
         python -m pip install -e .[quality,test]
+        python -m pip install git+https://github.com/huggingface/accelerate
 
     - name: Environment
       run: |
@@ -58,8 +59,6 @@ jobs:
         name: torch_test_reports
         path: reports
 
-
-
   run_examples_single_gpu:
     name: Examples tests
     runs-on: [ self-hosted, docker-gpu, single-gpu ]
@@ -83,6 +82,7 @@ jobs:
         python -m pip uninstall -y torch torchvision torchtext
         python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu116
         python -m pip install -e .[quality,test,training]
+        python -m pip install git+https://github.com/huggingface/accelerate
 
     - name: Environment
       run: |
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index d307bd5a076e..c9c58a748831 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -223,6 +223,8 @@ def device(self) -> torch.device:
         for name in module_names.keys():
             module = getattr(self, name)
             if isinstance(module, torch.nn.Module):
+                if module.device == torch.device("meta"):
+                    return torch.device("cpu")
                 return module.device
         return torch.device("cpu")
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 02a6b45fdefc..cf4c5c5fdeca 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -3,6 +3,7 @@
 
 import torch
 
+from diffusers.utils import is_accelerate_available
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -118,6 +119,18 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
+    def cuda_with_minimal_gpu_usage(self):
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device("cuda")
+        self.enable_attention_slicing(1)
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            cpu_offload(cpu_offloaded_model, device)
+
     @torch.no_grad()
     def __call__(
         self,
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index a4686366c8e7..6e9388ca3a65 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -535,3 +535,23 @@ def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self):
         tracemalloc.stop()
 
         assert peak_accelerate < peak_normal
+
+    @slow
+    @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
+    def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+
+        pipeline_id = "CompVis/stable-diffusion-v1-4"
+        prompt = "Andromeda galaxy in a bottle"
+
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            pipeline_id, revision="fp16", torch_dtype=torch.float32, use_auth_token=True
+        )
+        pipeline.cuda_with_minimal_gpu_usage()
+
+        _ = pipeline(prompt)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 0.8 GB is allocated
+        assert mem_bytes < 0.8 * 10**9

From bd06dd023f7f01040b6ee1ca463adda8bf2fc25e Mon Sep 17 00:00:00 2001
From: Hu Ye <xiaohuzc@gmail.com>
Date: Wed, 26 Oct 2022 22:41:57 +0800
Subject: [PATCH 054/269] [inpaint pipeline] fix bug for multiple prompts
 inputs (#959)

---
 .../stable_diffusion/pipeline_stable_diffusion_inpaint.py     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 918a1241f996..d49f55aa921d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -340,8 +340,8 @@ def __call__(
         masked_image_latents = 0.18215 * masked_image_latents
 
         # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
-        mask = mask.repeat(num_images_per_prompt, 1, 1, 1)
-        masked_image_latents = masked_image_latents.repeat(num_images_per_prompt, 1, 1, 1)
+        mask = mask.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
+        masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
 
         mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
         masked_image_latents = (

From 8332c1a6d956748db288d8edb7797ca38a5ea60e Mon Sep 17 00:00:00 2001
From: Simon Kirsten <1972314+skirsten@users.noreply.github.com>
Date: Wed, 26 Oct 2022 16:24:48 +0100
Subject: [PATCH 055/269] Enable multi-process DataLoader for dreambooth (#950)

---
 examples/dreambooth/train_dreambooth.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index c28bcbd2d762..6dbba9af564c 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -502,7 +502,7 @@ def collate_fn(examples):
         return batch
 
     train_dataloader = torch.utils.data.DataLoader(
-        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn
+        train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=collate_fn, num_workers=1
     )
 
     # Scheduler and math around the number of training steps.

From d3d22ce5a894becb951eec03e663951b28d45135 Mon Sep 17 00:00:00 2001
From: Brian Whicheloe <dev@whicheloe.us>
Date: Wed, 26 Oct 2022 09:46:56 -0700
Subject: [PATCH 056/269] Small modification to enable usage by external
 scripts (#956)

* Make training code usable by external scripts

Add parameter inputs to training and argument parsing function to allow this script to be used by an external call.

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 examples/dreambooth/train_dreambooth.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 6dbba9af564c..6d3880b1a755 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -26,7 +26,7 @@
 logger = get_logger(__name__)
 
 
-def parse_args():
+def parse_args(input_args):
     parser = argparse.ArgumentParser(description="Simple example of a training script.")
     parser.add_argument(
         "--pretrained_model_name_or_path",
@@ -196,7 +196,11 @@ def parse_args():
     )
     parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
 
-    args = parser.parse_args()
+    if input_args is not None:
+        args = parser.parse_args(input_args)
+    else:
+        args = parser.parse_args()
+
     env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
     if env_local_rank != -1 and env_local_rank != args.local_rank:
         args.local_rank = env_local_rank
@@ -319,8 +323,7 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
         return f"{organization}/{model_id}"
 
 
-def main():
-    args = parse_args()
+def main(args):
     logging_dir = Path(args.output_dir, args.logging_dir)
 
     accelerator = Accelerator(
@@ -653,4 +656,5 @@ def collate_fn(examples):
 
 
 if __name__ == "__main__":
-    main()
+    args = parse_args()
+    main(args)

From a23ad87d7aecf9e3db7f18287beb6116f96c2313 Mon Sep 17 00:00:00 2001
From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com>
Date: Thu, 27 Oct 2022 03:28:55 +0700
Subject: [PATCH 057/269] [Flax] Add Textual Inversion (#880)

* add textual inversion flax

* make style

* make style

* replicate vae and unet params

* make style

* minor

* save after end of training

* style

* Temporary fix

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Add Flax instruction

Co-authored-by: Suraj Patil <surajp815@gmail.com>
---
 examples/textual_inversion/README.md          |  18 +
 .../textual_inversion_flax.py                 | 629 ++++++++++++++++++
 2 files changed, 647 insertions(+)
 create mode 100644 examples/textual_inversion/textual_inversion_flax.py

diff --git a/examples/textual_inversion/README.md b/examples/textual_inversion/README.md
index 801c42c8a0f8..969ac722c14f 100644
--- a/examples/textual_inversion/README.md
+++ b/examples/textual_inversion/README.md
@@ -68,6 +68,24 @@ accelerate launch textual_inversion.py \
 
 A full training run takes ~1 hour on one V100 GPU.
 
+If you want to speed it up even more, Flax implementation is available:
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export DATA_DIR="path-to-dir-containing-images"
+
+python textual_inversion_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<cat-toy>" --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 --scale_lr \
+  --output_dir="textual_inversion_cat"
+```
+It should be at least 70% faster than the PyTorch script with the same configuration.
 
 ### Inference
 
diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py
new file mode 100644
index 000000000000..84ff97c39a96
--- /dev/null
+++ b/examples/textual_inversion/textual_inversion_flax.py
@@ -0,0 +1,629 @@
+import argparse
+import logging
+import math
+import os
+import random
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+from torch.utils.data import Dataset
+
+import jax
+import jax.numpy as jnp
+import optax
+import PIL
+import transformers
+from diffusers import (
+    FlaxAutoencoderKL,
+    FlaxDDPMScheduler,
+    FlaxPNDMScheduler,
+    FlaxStableDiffusionPipeline,
+    FlaxUNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
+from flax import jax_utils
+from flax.training import train_state
+from flax.training.common_utils import shard
+from huggingface_hub import HfFolder, Repository, whoami
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
+
+
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--train_data_dir", type=str, default=None, required=True, help="A folder containing the training data."
+    )
+    parser.add_argument(
+        "--placeholder_token",
+        type=str,
+        default=None,
+        required=True,
+        help="A token to use as a placeholder for the concept.",
+    )
+    parser.add_argument(
+        "--initializer_token", type=str, default=None, required=True, help="A token to use as initializer word."
+    )
+    parser.add_argument("--learnable_property", type=str, default="object", help="Choose between 'object' and 'style'")
+    parser.add_argument("--repeats", type=int, default=100, help="How many times to repeat the training data.")
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=5000,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=True,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument(
+        "--use_auth_token",
+        action="store_true",
+        help=(
+            "Will use the token generated when running `huggingface-cli login` (necessary to use this script with"
+            " private models)."
+        ),
+    )
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.train_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    return args
+
+
+imagenet_templates_small = [
+    "a photo of a {}",
+    "a rendering of a {}",
+    "a cropped photo of the {}",
+    "the photo of a {}",
+    "a photo of a clean {}",
+    "a photo of a dirty {}",
+    "a dark photo of the {}",
+    "a photo of my {}",
+    "a photo of the cool {}",
+    "a close-up photo of a {}",
+    "a bright photo of the {}",
+    "a cropped photo of a {}",
+    "a photo of the {}",
+    "a good photo of the {}",
+    "a photo of one {}",
+    "a close-up photo of the {}",
+    "a rendition of the {}",
+    "a photo of the clean {}",
+    "a rendition of a {}",
+    "a photo of a nice {}",
+    "a good photo of a {}",
+    "a photo of the nice {}",
+    "a photo of the small {}",
+    "a photo of the weird {}",
+    "a photo of the large {}",
+    "a photo of a cool {}",
+    "a photo of a small {}",
+]
+
+imagenet_style_templates_small = [
+    "a painting in the style of {}",
+    "a rendering in the style of {}",
+    "a cropped painting in the style of {}",
+    "the painting in the style of {}",
+    "a clean painting in the style of {}",
+    "a dirty painting in the style of {}",
+    "a dark painting in the style of {}",
+    "a picture in the style of {}",
+    "a cool painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a bright painting in the style of {}",
+    "a cropped painting in the style of {}",
+    "a good painting in the style of {}",
+    "a close-up painting in the style of {}",
+    "a rendition in the style of {}",
+    "a nice painting in the style of {}",
+    "a small painting in the style of {}",
+    "a weird painting in the style of {}",
+    "a large painting in the style of {}",
+]
+
+
+class TextualInversionDataset(Dataset):
+    def __init__(
+        self,
+        data_root,
+        tokenizer,
+        learnable_property="object",  # [object, style]
+        size=512,
+        repeats=100,
+        interpolation="bicubic",
+        flip_p=0.5,
+        set="train",
+        placeholder_token="*",
+        center_crop=False,
+    ):
+        self.data_root = data_root
+        self.tokenizer = tokenizer
+        self.learnable_property = learnable_property
+        self.size = size
+        self.placeholder_token = placeholder_token
+        self.center_crop = center_crop
+        self.flip_p = flip_p
+
+        self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)]
+
+        self.num_images = len(self.image_paths)
+        self._length = self.num_images
+
+        if set == "train":
+            self._length = self.num_images * repeats
+
+        self.interpolation = {
+            "linear": PIL.Image.LINEAR,
+            "bilinear": PIL.Image.BILINEAR,
+            "bicubic": PIL.Image.BICUBIC,
+            "lanczos": PIL.Image.LANCZOS,
+        }[interpolation]
+
+        self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
+        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, i):
+        example = {}
+        image = Image.open(self.image_paths[i % self.num_images])
+
+        if not image.mode == "RGB":
+            image = image.convert("RGB")
+
+        placeholder_string = self.placeholder_token
+        text = random.choice(self.templates).format(placeholder_string)
+
+        example["input_ids"] = self.tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids[0]
+
+        # default to score-sde preprocessing
+        img = np.array(image).astype(np.uint8)
+
+        if self.center_crop:
+            crop = min(img.shape[0], img.shape[1])
+            h, w, = (
+                img.shape[0],
+                img.shape[1],
+            )
+            img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2]
+
+        image = Image.fromarray(img)
+        image = image.resize((self.size, self.size), resample=self.interpolation)
+
+        image = self.flip_transform(image)
+        image = np.array(image).astype(np.uint8)
+        image = (image / 127.5 - 1.0).astype(np.float32)
+
+        example["pixel_values"] = torch.from_numpy(image).permute(2, 0, 1)
+        return example
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    if token is None:
+        token = HfFolder.get_token()
+    if organization is None:
+        username = whoami(token)["name"]
+        return f"{username}/{model_id}"
+    else:
+        return f"{organization}/{model_id}"
+
+
+def resize_token_embeddings(model, new_num_tokens, initializer_token_id, placeholder_token_id, rng):
+    if model.config.vocab_size == new_num_tokens or new_num_tokens is None:
+        return
+    model.config.vocab_size = new_num_tokens
+
+    params = model.params
+    old_embeddings = params["text_model"]["embeddings"]["token_embedding"]["embedding"]
+    old_num_tokens, emb_dim = old_embeddings.shape
+
+    initializer = jax.nn.initializers.normal()
+
+    new_embeddings = initializer(rng, (new_num_tokens, emb_dim))
+    new_embeddings = new_embeddings.at[:old_num_tokens].set(old_embeddings)
+    new_embeddings = new_embeddings.at[placeholder_token_id].set(new_embeddings[initializer_token_id])
+    params["text_model"]["embeddings"]["token_embedding"]["embedding"] = new_embeddings
+
+    model.params = params
+    return model
+
+
+def get_params_to_save(params):
+    return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
+
+
+def main():
+    args = parse_args()
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    if jax.process_index() == 0:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    # Load the tokenizer and add the placeholder token as a additional special token
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    # Add the placeholder token in tokenizer
+    num_added_tokens = tokenizer.add_tokens(args.placeholder_token)
+    if num_added_tokens == 0:
+        raise ValueError(
+            f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different"
+            " `placeholder_token` that is not already in the tokenizer."
+        )
+
+    # Convert the initializer_token, placeholder_token to ids
+    token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)
+    # Check if initializer_token is a single token or a sequence of tokens
+    if len(token_ids) > 1:
+        raise ValueError("The initializer token must be a single token.")
+
+    initializer_token_id = token_ids[0]
+    placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+    vae, vae_params = FlaxAutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
+    unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+
+    # Create sampling rng
+    rng = jax.random.PRNGKey(args.seed)
+    rng, _ = jax.random.split(rng)
+    # Resize the token embeddings as we are adding new special tokens to the tokenizer
+    text_encoder = resize_token_embeddings(
+        text_encoder, len(tokenizer), initializer_token_id, placeholder_token_id, rng
+    )
+    original_token_embeds = text_encoder.params["text_model"]["embeddings"]["token_embedding"]["embedding"]
+
+    train_dataset = TextualInversionDataset(
+        data_root=args.train_data_dir,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        placeholder_token=args.placeholder_token,
+        repeats=args.repeats,
+        learnable_property=args.learnable_property,
+        center_crop=args.center_crop,
+        set="train",
+    )
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        input_ids = torch.stack([example["input_ids"] for example in examples])
+
+        batch = {"pixel_values": pixel_values, "input_ids": input_ids}
+        batch = {k: v.numpy() for k, v in batch.items()}
+
+        return batch
+
+    total_train_batch_size = args.train_batch_size * jax.local_device_count()
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=total_train_batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn
+    )
+
+    # Optimization
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * total_train_batch_size
+
+    constant_scheduler = optax.constant_schedule(args.learning_rate)
+
+    optimizer = optax.adamw(
+        learning_rate=constant_scheduler,
+        b1=args.adam_beta1,
+        b2=args.adam_beta2,
+        eps=args.adam_epsilon,
+        weight_decay=args.adam_weight_decay,
+    )
+
+    def create_mask(params, label_fn):
+        def _map(params, mask, label_fn):
+            for k in params:
+                if label_fn(k):
+                    mask[k] = "token_embedding"
+                else:
+                    if isinstance(params[k], dict):
+                        mask[k] = {}
+                        _map(params[k], mask[k], label_fn)
+                    else:
+                        mask[k] = "zero"
+
+        mask = {}
+        _map(params, mask, label_fn)
+        return mask
+
+    def zero_grads():
+        # from https://github.com/deepmind/optax/issues/159#issuecomment-896459491
+        def init_fn(_):
+            return ()
+
+        def update_fn(updates, state, params=None):
+            return jax.tree_util.tree_map(jnp.zeros_like, updates), ()
+
+        return optax.GradientTransformation(init_fn, update_fn)
+
+    # Zero out gradients of layers other than the token embedding layer
+    tx = optax.multi_transform(
+        {"token_embedding": optimizer, "zero": zero_grads()},
+        create_mask(text_encoder.params, lambda s: s == "token_embedding"),
+    )
+
+    state = train_state.TrainState.create(apply_fn=text_encoder.__call__, params=text_encoder.params, tx=tx)
+
+    noise_scheduler = FlaxDDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
+    )
+
+    # Initialize our training
+    train_rngs = jax.random.split(rng, jax.local_device_count())
+
+    # Define gradient train step fn
+    def train_step(state, vae_params, unet_params, batch, train_rng):
+        dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3)
+
+        def compute_loss(params):
+            vae_outputs = vae.apply(
+                {"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode
+            )
+            latents = vae_outputs.latent_dist.sample(sample_rng)
+            # (NHWC) -> (NCHW)
+            latents = jnp.transpose(latents, (0, 3, 1, 2))
+            latents = latents * 0.18215
+
+            noise_rng, timestep_rng = jax.random.split(sample_rng)
+            noise = jax.random.normal(noise_rng, latents.shape)
+            bsz = latents.shape[0]
+            timesteps = jax.random.randint(
+                timestep_rng,
+                (bsz,),
+                0,
+                noise_scheduler.config.num_train_timesteps,
+            )
+            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+            encoder_hidden_states = state.apply_fn(
+                batch["input_ids"], params=params, dropout_rng=dropout_rng, train=True
+            )[0]
+            unet_outputs = unet.apply(
+                {"params": unet_params}, noisy_latents, timesteps, encoder_hidden_states, train=False
+            )
+            noise_pred = unet_outputs.sample
+            loss = (noise - noise_pred) ** 2
+            loss = loss.mean()
+
+            return loss
+
+        grad_fn = jax.value_and_grad(compute_loss)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+
+        # Keep the token embeddings fixed except the newly added embeddings for the concept,
+        # as we only want to optimize the concept embeddings
+        token_embeds = original_token_embeds.at[placeholder_token_id].set(
+            new_state.params["text_model"]["embeddings"]["token_embedding"]["embedding"][placeholder_token_id]
+        )
+        new_state.params["text_model"]["embeddings"]["token_embedding"]["embedding"] = token_embeds
+
+        metrics = {"loss": loss}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+        return new_state, metrics, new_train_rng
+
+    # Create parallel version of the train and eval step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    vae_params = jax_utils.replicate(vae_params)
+    unet_params = jax_utils.replicate(unet_params)
+
+    # Train!
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+
+    # Scheduler and math around the number of training steps.
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    global_step = 0
+
+    epochs = tqdm(range(args.num_train_epochs), desc=f"Epoch ... (1/{args.num_train_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+
+        train_metrics = []
+
+        steps_per_epoch = len(train_dataset) // total_train_batch_size
+        train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
+        # train
+        for batch in train_dataloader:
+            batch = shard(batch)
+            state, train_metric, train_rngs = p_train_step(state, vae_params, unet_params, batch, train_rngs)
+            train_metrics.append(train_metric)
+
+            train_step_progress_bar.update(1)
+            global_step += 1
+
+            if global_step >= args.max_train_steps:
+                break
+
+        train_metric = jax_utils.unreplicate(train_metric)
+
+        train_step_progress_bar.close()
+        epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
+
+    # Create the pipeline using using the trained modules and save it.
+    if jax.process_index() == 0:
+        scheduler = FlaxPNDMScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+        )
+        safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained(
+            "CompVis/stable-diffusion-safety-checker", from_pt=True
+        )
+        pipeline = FlaxStableDiffusionPipeline(
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
+        )
+
+        pipeline.save_pretrained(
+            args.output_dir,
+            params={
+                "text_encoder": get_params_to_save(state.params),
+                "vae": get_params_to_save(vae_params),
+                "unet": get_params_to_save(unet_params),
+                "safety_checker": safety_checker.params,
+            },
+        )
+
+        # Also save the newly trained embeddings
+        learned_embeds = get_params_to_save(state.params)["text_model"]["embeddings"]["token_embedding"]["embedding"][
+            placeholder_token_id
+        ]
+        learned_embeds_dict = {args.placeholder_token: learned_embeds}
+        jnp.save(os.path.join(args.output_dir, "learned_embeds.npy"), learned_embeds_dict)
+
+        if args.push_to_hub:
+            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
+
+
+if __name__ == "__main__":
+    main()

From 1d04e1b4deb1bdebabd9a94163ea5567e54c6ada Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Thu, 27 Oct 2022 10:21:40 +0200
Subject: [PATCH 058/269] Continuation of #942: additional float64 failure
 (#996)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add failing test for #940.

* Do not use torch.float64 in mps.

* style

* Temporarily skip add_noise for IPNDMScheduler.

Until #990 is addressed.

* Fix additional float64 error in mps.

* Improve add_noise test

* Slight edit – I think it's clearer this way.
---
 src/diffusers/schedulers/scheduling_lms_discrete.py | 10 +++++++---
 tests/test_scheduler.py                             |  3 ++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index 8b371703ec84..6157f4b4dc65 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -252,9 +252,13 @@ def add_noise(
     ) -> torch.FloatTensor:
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
         self.sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
-        dtype = torch.float32 if original_samples.device.type == "mps" else timesteps.dtype
-        self.timesteps = self.timesteps.to(original_samples.device, dtype=dtype)
-        timesteps = timesteps.to(original_samples.device)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            self.timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            self.timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
 
         schedule_timesteps = self.timesteps
 
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 6f86ffc85e05..194fb66f663f 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -266,13 +266,14 @@ def test_add_noise_device(self):
                 continue
             scheduler_config = self.get_scheduler_config()
             scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(100)
 
             sample = self.dummy_sample.to(torch_device)
             scaled_sample = scheduler.scale_model_input(sample, 0.0)
             self.assertEqual(sample.shape, scaled_sample.shape)
 
             noise = torch.randn_like(scaled_sample).to(torch_device)
-            t = torch.tensor([10]).to(torch_device)
+            t = scheduler.timesteps[5][None]
             noised = scheduler.add_noise(scaled_sample, noise, t)
             self.assertEqual(noised.shape, scaled_sample.shape)
 

From e92a603cab5c2c75c8d7fb348b9edb37deb24018 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Thu, 27 Oct 2022 11:44:06 +0200
Subject: [PATCH 059/269] fix dreambooth script. (#1017)

make input_args optional
---
 examples/dreambooth/train_dreambooth.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 6d3880b1a755..d3720f0cad2c 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -26,7 +26,7 @@
 logger = get_logger(__name__)
 
 
-def parse_args(input_args):
+def parse_args(input_args: None):
     parser = argparse.ArgumentParser(description="Simple example of a training script.")
     parser.add_argument(
         "--pretrained_model_name_or_path",

From 3be9fa97d6d3829c0678265b3fdb3563f6bad101 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 27 Oct 2022 12:11:42 +0200
Subject: [PATCH 060/269] [Accelerate model loading] Fix meta device and super
 low memory usage (#1016)

* [Accelerate model loading] Fix meta device and super low memory usage

* better naming
---
 .../pipeline_stable_diffusion.py              |  3 +-
 .../stable_diffusion/test_stable_diffusion.py | 37 ++++++++++
 tests/test_pipelines.py                       | 74 +------------------
 3 files changed, 39 insertions(+), 75 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index cf4c5c5fdeca..cca11281359a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -119,14 +119,13 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
-    def cuda_with_minimal_gpu_usage(self):
+    def enable_sequential_cpu_offload(self):
         if is_accelerate_available():
             from accelerate import cpu_offload
         else:
             raise ImportError("Please install accelerate via `pip install accelerate`")
 
         device = torch.device("cuda")
-        self.enable_attention_slicing(1)
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
             cpu_offload(cpu_offloaded_model, device)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index d7e6c362d1d7..a81710987814 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -15,6 +15,7 @@
 
 import gc
 import random
+import time
 import unittest
 
 import numpy as np
@@ -730,3 +731,39 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
             )
         assert test_callback_fn.has_been_called
         assert number_of_steps == 51
+
+    def test_stable_diffusion_accelerate_auto_device(self):
+        pipeline_id = "CompVis/stable-diffusion-v1-4"
+
+        start_time = time.time()
+        pipeline_normal_load = StableDiffusionPipeline.from_pretrained(
+            pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True
+        )
+        pipeline_normal_load.to(torch_device)
+        normal_load_time = time.time() - start_time
+
+        start_time = time.time()
+        _ = StableDiffusionPipeline.from_pretrained(
+            pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto"
+        )
+        meta_device_load_time = time.time() - start_time
+
+        assert 2 * meta_device_load_time < normal_load_time
+
+    @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
+    def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+
+        pipeline_id = "CompVis/stable-diffusion-v1-4"
+        prompt = "Andromeda galaxy in a bottle"
+
+        pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, revision="fp16", torch_dtype=torch.float16)
+        pipeline.enable_attention_slicing(1)
+        pipeline.enable_sequential_cpu_offload()
+
+        _ = pipeline(prompt, num_inference_steps=5)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 1.5 GB is allocated
+        assert mem_bytes < 1.5 * 10**9
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 6e9388ca3a65..3cc94962c38f 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -17,15 +17,12 @@
 import os
 import random
 import tempfile
-import tracemalloc
 import unittest
 
 import numpy as np
 import torch
 
-import accelerate
 import PIL
-import transformers
 from diffusers import (
     AutoencoderKL,
     DDIMPipeline,
@@ -44,8 +41,7 @@
 from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, slow, torch_device
-from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir, require_torch_gpu
-from packaging import version
+from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
@@ -487,71 +483,3 @@ def test_ddpm_ddim_equality_batched(self):
 
         # the values aren't exactly equal, but the images look the same visually
         assert np.abs(ddpm_images - ddim_images).max() < 1e-1
-
-    @require_torch_gpu
-    def test_stable_diffusion_accelerate_load_works(self):
-        if version.parse(version.parse(transformers.__version__).base_version) < version.parse("4.23"):
-            return
-
-        if version.parse(version.parse(accelerate.__version__).base_version) < version.parse("0.14"):
-            return
-
-        model_id = "CompVis/stable-diffusion-v1-4"
-        _ = StableDiffusionPipeline.from_pretrained(
-            model_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto"
-        ).to(torch_device)
-
-    @require_torch_gpu
-    def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self):
-        if version.parse(version.parse(transformers.__version__).base_version) < version.parse("4.23"):
-            return
-
-        if version.parse(version.parse(accelerate.__version__).base_version) < version.parse("0.14"):
-            return
-
-        pipeline_id = "CompVis/stable-diffusion-v1-4"
-
-        torch.cuda.empty_cache()
-        gc.collect()
-
-        tracemalloc.start()
-        pipeline_normal_load = StableDiffusionPipeline.from_pretrained(
-            pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True
-        )
-        pipeline_normal_load.to(torch_device)
-        _, peak_normal = tracemalloc.get_traced_memory()
-        tracemalloc.stop()
-
-        del pipeline_normal_load
-        torch.cuda.empty_cache()
-        gc.collect()
-
-        tracemalloc.start()
-        _ = StableDiffusionPipeline.from_pretrained(
-            pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto"
-        )
-        _, peak_accelerate = tracemalloc.get_traced_memory()
-
-        tracemalloc.stop()
-
-        assert peak_accelerate < peak_normal
-
-    @slow
-    @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
-    def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-
-        pipeline_id = "CompVis/stable-diffusion-v1-4"
-        prompt = "Andromeda galaxy in a bottle"
-
-        pipeline = StableDiffusionPipeline.from_pretrained(
-            pipeline_id, revision="fp16", torch_dtype=torch.float32, use_auth_token=True
-        )
-        pipeline.cuda_with_minimal_gpu_usage()
-
-        _ = pipeline(prompt)
-
-        mem_bytes = torch.cuda.max_memory_allocated()
-        # make sure that less than 0.8 GB is allocated
-        assert mem_bytes < 0.8 * 10**9

From abe058221c2a58ed100d7524c28fa6fdf23ff528 Mon Sep 17 00:00:00 2001
From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com>
Date: Thu, 27 Oct 2022 19:08:21 +0700
Subject: [PATCH 061/269] [Flax] Add finetune Stable Diffusion (#999)

* [Flax] Add finetune Stable Diffusion

* temporary fix

* drop_last and seed

* add dtype for mixed precision training

* style

* Add Flax example
---
 examples/text_to_image/README.md              |  36 ++
 .../text_to_image/train_text_to_image_flax.py | 558 ++++++++++++++++++
 2 files changed, 594 insertions(+)
 create mode 100644 examples/text_to_image/train_text_to_image_flax.py

diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md
index 6aca642cda4a..64385ddae0b1 100644
--- a/examples/text_to_image/README.md
+++ b/examples/text_to_image/README.md
@@ -62,6 +62,24 @@ accelerate launch train_text_to_image.py \
   --output_dir="sd-pokemon-model" 
 ```
 
+Or use the Flax implementation if you need a speedup
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export dataset_name="lambdalabs/pokemon-blip-captions"
+
+python train_text_to_image_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$dataset_name \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --output_dir="sd-pokemon-model" 
+```
+
 
 To run on your own training files prepare the dataset according to the format required by `datasets`, you can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata).
 If you wish to use custom loading logic, you should modify the script, we have left pointers for that in the training script.
@@ -86,6 +104,24 @@ accelerate launch train_text_to_image.py \
   --output_dir="sd-pokemon-model"
 ```
 
+Or use the Flax implementation if you need a speedup
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export TRAIN_DIR="path_to_your_dataset"
+
+python train_text_to_image_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --output_dir="sd-pokemon-model"
+```
+
 Once the training is finished the model will be saved in the `output_dir` specified in the command. In this example it's `sd-pokemon-model`. To load the fine-tuned model for inference just pass that path to `StableDiffusionPipeline`
 
 
diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py
new file mode 100644
index 000000000000..ab8d2b7ee2a3
--- /dev/null
+++ b/examples/text_to_image/train_text_to_image_flax.py
@@ -0,0 +1,558 @@
+import argparse
+import logging
+import math
+import os
+import random
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+
+import jax
+import jax.numpy as jnp
+import optax
+import transformers
+from datasets import load_dataset
+from diffusers import (
+    FlaxAutoencoderKL,
+    FlaxDDPMScheduler,
+    FlaxPNDMScheduler,
+    FlaxStableDiffusionPipeline,
+    FlaxUNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
+from flax import jax_utils
+from flax.training import train_state
+from flax.training.common_utils import shard
+from huggingface_hub import HfFolder, Repository, whoami
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
+
+
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that 🤗 Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--image_column", type=str, default="image", help="The column of the dataset containing an image."
+    )
+    parser.add_argument(
+        "--caption_column",
+        type=str,
+        default="text",
+        help="The column of the dataset containing a caption or a list of captions.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="sd-model-finetuned",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument("--seed", type=int, default=0, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop",
+        action="store_true",
+        help="Whether to center crop images before resizing to resolution (if not set, random crop will be used)",
+    )
+    parser.add_argument(
+        "--random_flip",
+        action="store_true",
+        help="whether to randomly flip images horizontally",
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=100)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--report_to",
+        type=str,
+        default="tensorboard",
+        help=(
+            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
+            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            "Only applicable when `--with_tracking` is passed."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("Need either a dataset name or a training folder.")
+
+    return args
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    if token is None:
+        token = HfFolder.get_token()
+    if organization is None:
+        username = whoami(token)["name"]
+        return f"{username}/{model_id}"
+    else:
+        return f"{organization}/{model_id}"
+
+
+dataset_name_mapping = {
+    "lambdalabs/pokemon-blip-captions": ("image", "text"),
+}
+
+
+def get_params_to_save(params):
+    return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
+
+
+def main():
+    args = parse_args()
+
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Handle the repository creation
+    if jax.process_index() == 0:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    # Get the datasets: you can either provide your own training and evaluation files (see below)
+    # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
+
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+        )
+    else:
+        data_files = {}
+        if args.train_data_dir is not None:
+            data_files["train"] = os.path.join(args.train_data_dir, "**")
+        dataset = load_dataset(
+            "imagefolder",
+            data_files=data_files,
+            cache_dir=args.cache_dir,
+        )
+        # See more about loading custom images at
+        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder
+
+    # Preprocessing the datasets.
+    # We need to tokenize inputs and targets.
+    column_names = dataset["train"].column_names
+
+    # 6. Get the column names for input/target.
+    dataset_columns = dataset_name_mapping.get(args.dataset_name, None)
+    if args.image_column is None:
+        image_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+    else:
+        image_column = args.image_column
+        if image_column not in column_names:
+            raise ValueError(
+                f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}"
+            )
+    if args.caption_column is None:
+        caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+    else:
+        caption_column = args.caption_column
+        if caption_column not in column_names:
+            raise ValueError(
+                f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}"
+            )
+
+    # Preprocessing the datasets.
+    # We need to tokenize input captions and transform the images.
+    def tokenize_captions(examples, is_train=True):
+        captions = []
+        for caption in examples[caption_column]:
+            if isinstance(caption, str):
+                captions.append(caption)
+            elif isinstance(caption, (list, np.ndarray)):
+                # take a random caption if there are multiple
+                captions.append(random.choice(caption) if is_train else caption[0])
+            else:
+                raise ValueError(
+                    f"Caption column `{caption_column}` should contain either strings or lists of strings."
+                )
+        inputs = tokenizer(captions, max_length=tokenizer.model_max_length, padding="do_not_pad", truncation=True)
+        input_ids = inputs.input_ids
+        return input_ids
+
+    train_transforms = transforms.Compose(
+        [
+            transforms.Resize((args.resolution, args.resolution), interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+
+    def preprocess_train(examples):
+        images = [image.convert("RGB") for image in examples[image_column]]
+        examples["pixel_values"] = [train_transforms(image) for image in images]
+        examples["input_ids"] = tokenize_captions(examples)
+
+        return examples
+
+    if jax.process_index() == 0:
+        if args.max_train_samples is not None:
+            dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples))
+        # Set the training transforms
+        train_dataset = dataset["train"].with_transform(preprocess_train)
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example["pixel_values"] for example in examples])
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+        input_ids = [example["input_ids"] for example in examples]
+
+        padded_tokens = tokenizer.pad(
+            {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
+        )
+        batch = {
+            "pixel_values": pixel_values,
+            "input_ids": padded_tokens.input_ids,
+        }
+        batch = {k: v.numpy() for k, v in batch.items()}
+
+        return batch
+
+    total_train_batch_size = args.train_batch_size * jax.local_device_count()
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=total_train_batch_size, drop_last=True
+    )
+
+    weight_dtype = torch.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Load models and create wrapper for stable diffusion
+    tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+    text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", dtype=weight_dtype)
+    vae, vae_params = FlaxAutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", dtype=weight_dtype
+    )
+    unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", dtype=weight_dtype
+    )
+
+    # Optimization
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * total_train_batch_size
+
+    constant_scheduler = optax.constant_schedule(args.learning_rate)
+
+    adamw = optax.adamw(
+        learning_rate=constant_scheduler,
+        b1=args.adam_beta1,
+        b2=args.adam_beta2,
+        eps=args.adam_epsilon,
+        weight_decay=args.adam_weight_decay,
+    )
+
+    optimizer = optax.chain(
+        optax.clip_by_global_norm(args.max_grad_norm),
+        adamw,
+    )
+
+    state = train_state.TrainState.create(apply_fn=unet.__call__, params=unet_params, tx=optimizer)
+
+    noise_scheduler = FlaxDDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
+    )
+
+    # Initialize our training
+    rng = jax.random.PRNGKey(args.seed)
+    train_rngs = jax.random.split(rng, jax.local_device_count())
+
+    def train_step(state, text_encoder_params, vae_params, batch, train_rng):
+        dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3)
+
+        def compute_loss(params):
+            # Convert images to latent space
+            vae_outputs = vae.apply(
+                {"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode
+            )
+            latents = vae_outputs.latent_dist.sample(sample_rng)
+            # (NHWC) -> (NCHW)
+            latents = jnp.transpose(latents, (0, 3, 1, 2))
+            latents = latents * 0.18215
+
+            # Sample noise that we'll add to the latents
+            noise_rng, timestep_rng = jax.random.split(sample_rng)
+            noise = jax.random.normal(noise_rng, latents.shape)
+            # Sample a random timestep for each image
+            bsz = latents.shape[0]
+            timesteps = jax.random.randint(
+                timestep_rng,
+                (bsz,),
+                0,
+                noise_scheduler.config.num_train_timesteps,
+            )
+
+            # Add noise to the latents according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+            # Get the text embedding for conditioning
+            encoder_hidden_states = text_encoder(
+                batch["input_ids"],
+                params=text_encoder_params,
+                train=False,
+            )[0]
+
+            # Predict the noise residual and compute loss
+            unet_outputs = unet.apply({"params": params}, noisy_latents, timesteps, encoder_hidden_states, train=True)
+            noise_pred = unet_outputs.sample
+            loss = (noise - noise_pred) ** 2
+            loss = loss.mean()
+
+            return loss
+
+        grad_fn = jax.value_and_grad(compute_loss)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+
+        new_state = state.apply_gradients(grads=grad)
+
+        metrics = {"loss": loss}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+
+        return new_state, metrics, new_train_rng
+
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+    text_encoder_params = jax_utils.replicate(text_encoder.params)
+    vae_params = jax_utils.replicate(vae_params)
+
+    # Train!
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+
+    # Scheduler and math around the number of training steps.
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    global_step = 0
+
+    epochs = tqdm(range(args.num_train_epochs), desc="Epoch ... ", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+
+        train_metrics = []
+
+        steps_per_epoch = len(train_dataset) // total_train_batch_size
+        train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
+        # train
+        for batch in train_dataloader:
+            batch = shard(batch)
+            state, train_metric, train_rngs = p_train_step(state, text_encoder_params, vae_params, batch, train_rngs)
+            train_metrics.append(train_metric)
+
+            train_step_progress_bar.update(1)
+
+            global_step += 1
+            if global_step >= args.max_train_steps:
+                break
+
+        train_metric = jax_utils.unreplicate(train_metric)
+
+        train_step_progress_bar.close()
+        epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
+
+    # Create the pipeline using using the trained modules and save it.
+    if jax.process_index() == 0:
+        scheduler = FlaxPNDMScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+        )
+        safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained(
+            "CompVis/stable-diffusion-safety-checker", from_pt=True
+        )
+        pipeline = FlaxStableDiffusionPipeline(
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
+        )
+
+        pipeline.save_pretrained(
+            args.output_dir,
+            params={
+                "text_encoder": get_params_to_save(text_encoder_params),
+                "vae": get_params_to_save(vae_params),
+                "unet": get_params_to_save(state.params),
+                "safety_checker": safety_checker.params,
+            },
+        )
+
+        if args.push_to_hub:
+            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
+
+
+if __name__ == "__main__":
+    main()

From 4623f095f3ac00582369bb0a13d208c2072b049d Mon Sep 17 00:00:00 2001
From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com>
Date: Thu, 27 Oct 2022 19:19:13 +0700
Subject: [PATCH 062/269] [DreamBooth] Set train mode for text encoder (#1012)

Set train mode for text encoder
---
 examples/dreambooth/train_dreambooth.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index d3720f0cad2c..9b7e17241db0 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -574,6 +574,8 @@ def collate_fn(examples):
 
     for epoch in range(args.num_train_epochs):
         unet.train()
+        if args.train_text_encoder:
+            text_encoder.train()
         for step, batch in enumerate(train_dataloader):
             with accelerator.accumulate(unet):
                 # Convert images to latent space

From 90f91adb0ebd6d04c4f4e5dd87afd08acff16c9e Mon Sep 17 00:00:00 2001
From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com>
Date: Thu, 27 Oct 2022 19:25:04 +0700
Subject: [PATCH 063/269] [Flax] Add DreamBooth (#1001)

* [Flax] Add DreamBooth

* fix sample rng

* style

* not reuse rng

* add dtype for mixed precision training

* Add Flax example
---
 examples/dreambooth/README.md                |  65 ++
 examples/dreambooth/train_dreambooth_flax.py | 665 +++++++++++++++++++
 2 files changed, 730 insertions(+)
 create mode 100644 examples/dreambooth/train_dreambooth_flax.py

diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index fb4bb900a953..6cae5ca2f245 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -58,6 +58,24 @@ accelerate launch train_dreambooth.py \
   --max_train_steps=400
 ```
 
+Or use the Flax implementation if you need a speedup
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="path-to-instance-images"
+export OUTPUT_DIR="path-to-save-model"
+
+python train_dreambooth_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --max_train_steps=400
+```
+
 ### Training with prior-preservation loss
 
 Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
@@ -87,6 +105,29 @@ accelerate launch train_dreambooth.py \
   --max_train_steps=800
 ```
 
+Or use the Flax implementation if you need a speedup
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+python train_dreambooth_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
 ### Training on a 16GB GPU:
 
 With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.
@@ -193,6 +234,30 @@ accelerate launch train_dreambooth.py \
   --max_train_steps=800
 ```
 
+Or use the Flax implementation if you need a speedup
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+python train_dreambooth_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --train_text_encoder \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=2e-6 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
 ## Inference
 
 Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.
diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py
new file mode 100644
index 000000000000..d2652606b551
--- /dev/null
+++ b/examples/dreambooth/train_dreambooth_flax.py
@@ -0,0 +1,665 @@
+import argparse
+import hashlib
+import logging
+import math
+import os
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+from torch.utils.data import Dataset
+
+import jax
+import jax.numpy as jnp
+import optax
+import transformers
+from diffusers import (
+    FlaxAutoencoderKL,
+    FlaxDDPMScheduler,
+    FlaxPNDMScheduler,
+    FlaxStableDiffusionPipeline,
+    FlaxUNet2DConditionModel,
+)
+from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
+from flax import jax_utils
+from flax.training import train_state
+from flax.training.common_utils import shard
+from huggingface_hub import HfFolder, Repository, whoami
+from PIL import Image
+from torchvision import transforms
+from tqdm.auto import tqdm
+from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
+
+
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--instance_data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="A folder containing the training data of instance images.",
+    )
+    parser.add_argument(
+        "--class_data_dir",
+        type=str,
+        default=None,
+        required=False,
+        help="A folder containing the training data of class images.",
+    )
+    parser.add_argument(
+        "--instance_prompt",
+        type=str,
+        default=None,
+        help="The prompt with identifier specifying the instance",
+    )
+    parser.add_argument(
+        "--class_prompt",
+        type=str,
+        default=None,
+        help="The prompt to specify images in the same class as provided instance images.",
+    )
+    parser.add_argument(
+        "--with_prior_preservation",
+        default=False,
+        action="store_true",
+        help="Flag to add prior preservation loss.",
+    )
+    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
+    parser.add_argument(
+        "--num_class_images",
+        type=int,
+        default=100,
+        help=(
+            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
+            " sampled with class_prompt."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="text-inversion-model",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--seed", type=int, default=0, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=512,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
+    )
+    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")
+    parser.add_argument(
+        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
+    )
+    parser.add_argument("--num_train_epochs", type=int, default=1)
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-6,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--scale_lr",
+        action="store_true",
+        default=False,
+        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="constant",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.instance_data_dir is None:
+        raise ValueError("You must specify a train data directory.")
+
+    if args.with_prior_preservation:
+        if args.class_data_dir is None:
+            raise ValueError("You must specify a data directory for class images.")
+        if args.class_prompt is None:
+            raise ValueError("You must specify prompt for class images.")
+
+    return args
+
+
+class DreamBoothDataset(Dataset):
+    """
+    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
+    It pre-processes the images and the tokenizes prompts.
+    """
+
+    def __init__(
+        self,
+        instance_data_root,
+        instance_prompt,
+        tokenizer,
+        class_data_root=None,
+        class_prompt=None,
+        size=512,
+        center_crop=False,
+    ):
+        self.size = size
+        self.center_crop = center_crop
+        self.tokenizer = tokenizer
+
+        self.instance_data_root = Path(instance_data_root)
+        if not self.instance_data_root.exists():
+            raise ValueError("Instance images root doesn't exists.")
+
+        self.instance_images_path = list(Path(instance_data_root).iterdir())
+        self.num_instance_images = len(self.instance_images_path)
+        self.instance_prompt = instance_prompt
+        self._length = self.num_instance_images
+
+        if class_data_root is not None:
+            self.class_data_root = Path(class_data_root)
+            self.class_data_root.mkdir(parents=True, exist_ok=True)
+            self.class_images_path = list(self.class_data_root.iterdir())
+            self.num_class_images = len(self.class_images_path)
+            self._length = max(self.num_class_images, self.num_instance_images)
+            self.class_prompt = class_prompt
+        else:
+            self.class_data_root = None
+
+        self.image_transforms = transforms.Compose(
+            [
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
+                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
+                transforms.ToTensor(),
+                transforms.Normalize([0.5], [0.5]),
+            ]
+        )
+
+    def __len__(self):
+        return self._length
+
+    def __getitem__(self, index):
+        example = {}
+        instance_image = Image.open(self.instance_images_path[index % self.num_instance_images])
+        if not instance_image.mode == "RGB":
+            instance_image = instance_image.convert("RGB")
+        example["instance_images"] = self.image_transforms(instance_image)
+        example["instance_prompt_ids"] = self.tokenizer(
+            self.instance_prompt,
+            padding="do_not_pad",
+            truncation=True,
+            max_length=self.tokenizer.model_max_length,
+        ).input_ids
+
+        if self.class_data_root:
+            class_image = Image.open(self.class_images_path[index % self.num_class_images])
+            if not class_image.mode == "RGB":
+                class_image = class_image.convert("RGB")
+            example["class_images"] = self.image_transforms(class_image)
+            example["class_prompt_ids"] = self.tokenizer(
+                self.class_prompt,
+                padding="do_not_pad",
+                truncation=True,
+                max_length=self.tokenizer.model_max_length,
+            ).input_ids
+
+        return example
+
+
+class PromptDataset(Dataset):
+    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+
+    def __init__(self, prompt, num_samples):
+        self.prompt = prompt
+        self.num_samples = num_samples
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        example = {}
+        example["prompt"] = self.prompt
+        example["index"] = index
+        return example
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    if token is None:
+        token = HfFolder.get_token()
+    if organization is None:
+        username = whoami(token)["name"]
+        return f"{username}/{model_id}"
+    else:
+        return f"{organization}/{model_id}"
+
+
+def get_params_to_save(params):
+    return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params))
+
+
+def main():
+    args = parse_args()
+
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    if jax.process_index() == 0:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    rng = jax.random.PRNGKey(args.seed)
+
+    if args.with_prior_preservation:
+        class_images_dir = Path(args.class_data_dir)
+        if not class_images_dir.exists():
+            class_images_dir.mkdir(parents=True)
+        cur_class_images = len(list(class_images_dir.iterdir()))
+
+        if cur_class_images < args.num_class_images:
+            pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
+                args.pretrained_model_name_or_path, safety_checker=None
+            )
+            pipeline.set_progress_bar_config(disable=True)
+
+            num_new_images = args.num_class_images - cur_class_images
+            logger.info(f"Number of class images to sample: {num_new_images}.")
+
+            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+
+            for example in tqdm(
+                sample_dataloader, desc="Generating class images", disable=not jax.process_index() == 0
+            ):
+                prompt_ids = pipeline.prepare_inputs(example["prompt"])
+                prompt_ids = shard(prompt_ids)
+                p_params = jax_utils.replicate(params)
+                rng = jax.random.split(rng)[0]
+                sample_rng = jax.random.split(rng, jax.device_count())
+                images = pipeline(prompt_ids, p_params, sample_rng, jit=True).images
+                images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
+                images = pipeline.numpy_to_pil(np.array(images))
+
+                for i, image in enumerate(images):
+                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
+                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
+                    image.save(image_filename)
+
+            del pipeline
+
+    # Handle the repository creation
+    if jax.process_index() == 0:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load the tokenizer and add the placeholder token as a additional special token
+    if args.tokenizer_name:
+        tokenizer = CLIPTokenizer.from_pretrained(args.tokenizer_name)
+    elif args.pretrained_model_name_or_path:
+        tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
+
+    train_dataset = DreamBoothDataset(
+        instance_data_root=args.instance_data_dir,
+        instance_prompt=args.instance_prompt,
+        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
+        class_prompt=args.class_prompt,
+        tokenizer=tokenizer,
+        size=args.resolution,
+        center_crop=args.center_crop,
+    )
+
+    def collate_fn(examples):
+        input_ids = [example["instance_prompt_ids"] for example in examples]
+        pixel_values = [example["instance_images"] for example in examples]
+
+        # Concat class and instance examples for prior preservation.
+        # We do this to avoid doing two forward passes.
+        if args.with_prior_preservation:
+            input_ids += [example["class_prompt_ids"] for example in examples]
+            pixel_values += [example["class_images"] for example in examples]
+
+        pixel_values = torch.stack(pixel_values)
+        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
+
+        input_ids = tokenizer.pad(
+            {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
+        ).input_ids
+
+        batch = {
+            "input_ids": input_ids,
+            "pixel_values": pixel_values,
+        }
+        batch = {k: v.numpy() for k, v in batch.items()}
+        return batch
+
+    total_train_batch_size = args.train_batch_size * jax.local_device_count()
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=total_train_batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True
+    )
+
+    weight_dtype = jnp.float32
+    if args.mixed_precision == "fp16":
+        weight_dtype = jnp.float16
+    elif args.mixed_precision == "bf16":
+        weight_dtype = jnp.bfloat16
+
+    # Load models and create wrapper for stable diffusion
+    text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", dtype=weight_dtype)
+    vae, vae_params = FlaxAutoencoderKL.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="vae", dtype=weight_dtype
+    )
+    unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="unet", dtype=weight_dtype
+    )
+
+    # Optimization
+    if args.scale_lr:
+        args.learning_rate = args.learning_rate * total_train_batch_size
+
+    constant_scheduler = optax.constant_schedule(args.learning_rate)
+
+    adamw = optax.adamw(
+        learning_rate=constant_scheduler,
+        b1=args.adam_beta1,
+        b2=args.adam_beta2,
+        eps=args.adam_epsilon,
+        weight_decay=args.adam_weight_decay,
+    )
+
+    optimizer = optax.chain(
+        optax.clip_by_global_norm(args.max_grad_norm),
+        adamw,
+    )
+
+    unet_state = train_state.TrainState.create(apply_fn=unet.__call__, params=unet_params, tx=optimizer)
+    text_encoder_state = train_state.TrainState.create(
+        apply_fn=text_encoder.__call__, params=text_encoder.params, tx=optimizer
+    )
+
+    noise_scheduler = FlaxDDPMScheduler(
+        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
+    )
+
+    # Initialize our training
+    train_rngs = jax.random.split(rng, jax.local_device_count())
+
+    def train_step(unet_state, text_encoder_state, vae_params, batch, train_rng):
+        dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3)
+
+        if args.train_text_encoder:
+            params = {"text_encoder": text_encoder_state.params, "unet": unet_state.params}
+        else:
+            params = {"unet": unet_state.params}
+
+        def compute_loss(params):
+            # Convert images to latent space
+            vae_outputs = vae.apply(
+                {"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode
+            )
+            latents = vae_outputs.latent_dist.sample(sample_rng)
+            # (NHWC) -> (NCHW)
+            latents = jnp.transpose(latents, (0, 3, 1, 2))
+            latents = latents * 0.18215
+
+            # Sample noise that we'll add to the latents
+            noise_rng, timestep_rng = jax.random.split(sample_rng)
+            noise = jax.random.normal(noise_rng, latents.shape)
+            # Sample a random timestep for each image
+            bsz = latents.shape[0]
+            timesteps = jax.random.randint(
+                timestep_rng,
+                (bsz,),
+                0,
+                noise_scheduler.config.num_train_timesteps,
+            )
+
+            # Add noise to the latents according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+            # Get the text embedding for conditioning
+            if args.train_text_encoder:
+                encoder_hidden_states = text_encoder_state.apply_fn(
+                    batch["input_ids"], params=params["text_encoder"], dropout_rng=dropout_rng, train=True
+                )[0]
+            else:
+                encoder_hidden_states = text_encoder(
+                    batch["input_ids"], params=text_encoder_state.params, train=False
+                )[0]
+
+            # Predict the noise residual
+            unet_outputs = unet.apply(
+                {"params": params["unet"]}, noisy_latents, timesteps, encoder_hidden_states, train=True
+            )
+            noise_pred = unet_outputs.sample
+
+            if args.with_prior_preservation:
+                # Chunk the noise and noise_pred into two parts and compute the loss on each part separately.
+                noise_pred, noise_pred_prior = jnp.split(noise_pred, 2, axis=0)
+                noise, noise_prior = jnp.split(noise, 2, axis=0)
+
+                # Compute instance loss
+                loss = (noise - noise_pred) ** 2
+                loss = loss.mean()
+
+                # Compute prior loss
+                prior_loss = (noise_prior - noise_pred_prior) ** 2
+                prior_loss = prior_loss.mean()
+
+                # Add the prior loss to the instance loss.
+                loss = loss + args.prior_loss_weight * prior_loss
+            else:
+                loss = (noise - noise_pred) ** 2
+                loss = loss.mean()
+
+            return loss
+
+        grad_fn = jax.value_and_grad(compute_loss)
+        loss, grad = grad_fn(params)
+        grad = jax.lax.pmean(grad, "batch")
+
+        new_unet_state = unet_state.apply_gradients(grads=grad["unet"])
+        if args.train_text_encoder:
+            new_text_encoder_state = text_encoder_state.apply_gradients(grads=grad["text_encoder"])
+        else:
+            new_text_encoder_state = text_encoder_state
+
+        metrics = {"loss": loss}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+
+        return new_unet_state, new_text_encoder_state, metrics, new_train_rng
+
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0, 1))
+
+    # Replicate the train state on each device
+    unet_state = jax_utils.replicate(unet_state)
+    text_encoder_state = jax_utils.replicate(text_encoder_state)
+    vae_params = jax_utils.replicate(vae_params)
+
+    # Train!
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader))
+
+    # Scheduler and math around the number of training steps.
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel & distributed) = {total_train_batch_size}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    global_step = 0
+
+    epochs = tqdm(range(args.num_train_epochs), desc="Epoch ... ", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+
+        train_metrics = []
+
+        steps_per_epoch = len(train_dataset) // total_train_batch_size
+        train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
+        # train
+        for batch in train_dataloader:
+            batch = shard(batch)
+            unet_state, text_encoder_state, train_metric, train_rngs = p_train_step(
+                unet_state, text_encoder_state, vae_params, batch, train_rngs
+            )
+            train_metrics.append(train_metric)
+
+            train_step_progress_bar.update(1)
+
+            global_step += 1
+            if global_step >= args.max_train_steps:
+                break
+
+        train_metric = jax_utils.unreplicate(train_metric)
+
+        train_step_progress_bar.close()
+        epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})")
+
+    # Create the pipeline using using the trained modules and save it.
+    if jax.process_index() == 0:
+        scheduler = FlaxPNDMScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
+        )
+        safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained(
+            "CompVis/stable-diffusion-safety-checker", from_pt=True
+        )
+        pipeline = FlaxStableDiffusionPipeline(
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
+        )
+
+        pipeline.save_pretrained(
+            args.output_dir,
+            params={
+                "text_encoder": get_params_to_save(text_encoder_state.params),
+                "vae": get_params_to_save(vae_params),
+                "unet": get_params_to_save(unet_state.params),
+                "safety_checker": safety_checker.params,
+            },
+        )
+
+        if args.push_to_hub:
+            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
+
+
+if __name__ == "__main__":
+    main()

From fbcc383340bfd6376ba91cf941d43d95596927fe Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 27 Oct 2022 15:16:59 +0200
Subject: [PATCH 064/269] Deprecate `init_git_repo`, refactor
 `train_unconditional.py` (#1022)

Deprecate `init_git_repo` and `push_to_hub`, refactor `train_unconditional.py`
---
 .../train_unconditional.py                    | 228 +++++++++++++-----
 src/diffusers/hub_utils.py                    |  13 +-
 2 files changed, 186 insertions(+), 55 deletions(-)

diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py
index f9f8c85bd1e3..6a1495b34c9f 100644
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -1,6 +1,8 @@
 import argparse
 import math
 import os
+from pathlib import Path
+from typing import Optional
 
 import torch
 import torch.nn.functional as F
@@ -9,9 +11,9 @@
 from accelerate.logging import get_logger
 from datasets import load_dataset
 from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
-from diffusers.hub_utils import init_git_repo
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
+from huggingface_hub import HfFolder, Repository, whoami
 from torchvision.transforms import (
     CenterCrop,
     Compose,
@@ -27,6 +29,160 @@
 logger = get_logger(__name__)
 
 
+def parse_args():
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help=(
+            "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private,"
+            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
+            " or to a folder containing files that HF Datasets can understand."
+        ),
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
+    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help=(
+            "A folder containing the training data. Folder contents must follow the structure described in"
+            " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file"
+            " must exist to provide the captions for the images. Ignored if `dataset_name` is specified."
+        ),
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="ddpm-model-64",
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--overwrite_output_dir", action="store_true")
+    parser.add_argument(
+        "--cache_dir",
+        type=str,
+        default=None,
+        help="The directory where the downloaded models and datasets will be stored.",
+    )
+    parser.add_argument(
+        "--resolution",
+        type=int,
+        default=64,
+        help=(
+            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
+            " resolution"
+        ),
+    )
+    parser.add_argument(
+        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
+    )
+    parser.add_argument(
+        "--eval_batch_size", type=int, default=16, help="Batch size (per device) for the eval dataloader."
+    )
+    parser.add_argument("--num_epochs", type=int, default=100)
+    parser.add_argument("--save_images_epochs", type=int, default=10, help="How often to save images during training.")
+    parser.add_argument(
+        "--save_model_epochs", type=int, default=10, help="How often to save the model during training."
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=1e-4,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument(
+        "--lr_scheduler",
+        type=str,
+        default="cosine",
+        help=(
+            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
+            ' "constant", "constant_with_warmup"]'
+        ),
+    )
+    parser.add_argument(
+        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--adam_beta1", type=float, default=0.95, help="The beta1 parameter for the Adam optimizer.")
+    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
+    parser.add_argument(
+        "--adam_weight_decay", type=float, default=1e-6, help="Weight decay magnitude for the Adam optimizer."
+    )
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.")
+    parser.add_argument(
+        "--use_ema",
+        action="store_true",
+        default=True,
+        help="Whether to use Exponential Moving Average for the final model weights.",
+    )
+    parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.")
+    parser.add_argument("--ema_power", type=float, default=3 / 4, help="The power value for the EMA decay.")
+    parser.add_argument("--ema_max_decay", type=float, default=0.9999, help="The maximum decay magnitude for EMA.")
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--hub_model_id",
+        type=str,
+        default=None,
+        help="The name of the repository to keep in sync with the local `output_dir`.",
+    )
+    parser.add_argument(
+        "--hub_private_repo", action="store_true", help="Whether or not to create a private repository."
+    )
+    parser.add_argument(
+        "--logging_dir",
+        type=str,
+        default="logs",
+        help=(
+            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
+            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
+        ),
+    )
+    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
+
+    return args
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    if token is None:
+        token = HfFolder.get_token()
+    if organization is None:
+        username = whoami(token)["name"]
+        return f"{username}/{model_id}"
+    else:
+        return f"{organization}/{model_id}"
+
+
 def main(args):
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
     accelerator = Accelerator(
@@ -110,8 +266,22 @@ def transforms(examples):
 
     ema_model = EMAModel(model, inv_gamma=args.ema_inv_gamma, power=args.ema_power, max_value=args.ema_max_decay)
 
-    if args.push_to_hub:
-        repo = init_git_repo(args, at_init=True)
+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+
+            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
+                if "step_*" not in gitignore:
+                    gitignore.write("step_*\n")
+                if "epoch_*" not in gitignore:
+                    gitignore.write("epoch_*\n")
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
 
     if accelerator.is_main_process:
         run = os.path.split(__file__)[-1].split(".")[0]
@@ -193,55 +363,5 @@ def transforms(examples):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Simple example of a training script.")
-    parser.add_argument("--local_rank", type=int, default=-1)
-    parser.add_argument("--dataset_name", type=str, default=None)
-    parser.add_argument("--dataset_config_name", type=str, default=None)
-    parser.add_argument("--train_data_dir", type=str, default=None, help="A folder containing the training data.")
-    parser.add_argument("--output_dir", type=str, default="ddpm-model-64")
-    parser.add_argument("--overwrite_output_dir", action="store_true")
-    parser.add_argument("--cache_dir", type=str, default=None)
-    parser.add_argument("--resolution", type=int, default=64)
-    parser.add_argument("--train_batch_size", type=int, default=16)
-    parser.add_argument("--eval_batch_size", type=int, default=16)
-    parser.add_argument("--num_epochs", type=int, default=100)
-    parser.add_argument("--save_images_epochs", type=int, default=10)
-    parser.add_argument("--save_model_epochs", type=int, default=10)
-    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
-    parser.add_argument("--learning_rate", type=float, default=1e-4)
-    parser.add_argument("--lr_scheduler", type=str, default="cosine")
-    parser.add_argument("--lr_warmup_steps", type=int, default=500)
-    parser.add_argument("--adam_beta1", type=float, default=0.95)
-    parser.add_argument("--adam_beta2", type=float, default=0.999)
-    parser.add_argument("--adam_weight_decay", type=float, default=1e-6)
-    parser.add_argument("--adam_epsilon", type=float, default=1e-08)
-    parser.add_argument("--use_ema", action="store_true", default=True)
-    parser.add_argument("--ema_inv_gamma", type=float, default=1.0)
-    parser.add_argument("--ema_power", type=float, default=3 / 4)
-    parser.add_argument("--ema_max_decay", type=float, default=0.9999)
-    parser.add_argument("--push_to_hub", action="store_true")
-    parser.add_argument("--hub_token", type=str, default=None)
-    parser.add_argument("--hub_model_id", type=str, default=None)
-    parser.add_argument("--hub_private_repo", action="store_true")
-    parser.add_argument("--logging_dir", type=str, default="logs")
-    parser.add_argument(
-        "--mixed_precision",
-        type=str,
-        default="no",
-        choices=["no", "fp16", "bf16"],
-        help=(
-            "Whether to use mixed precision. Choose"
-            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-            "and an Nvidia Ampere GPU."
-        ),
-    )
-
-    args = parser.parse_args()
-    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
-    if env_local_rank != -1 and env_local_rank != args.local_rank:
-        args.local_rank = env_local_rank
-
-    if args.dataset_name is None and args.train_data_dir is None:
-        raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
-
+    args = parse_args()
     main(args)
diff --git a/src/diffusers/hub_utils.py b/src/diffusers/hub_utils.py
index c07329e36fe7..1f8cc0db0fc9 100644
--- a/src/diffusers/hub_utils.py
+++ b/src/diffusers/hub_utils.py
@@ -22,7 +22,7 @@
 from huggingface_hub import HfFolder, Repository, whoami
 
 from .pipeline_utils import DiffusionPipeline
-from .utils import is_modelcards_available, logging
+from .utils import deprecate, is_modelcards_available, logging
 
 
 if is_modelcards_available():
@@ -53,6 +53,12 @@ def init_git_repo(args, at_init: bool = False):
             Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is `True`
             and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped out.
     """
+    deprecation_message = (
+        "Please use `huggingface_hub.Repository`. "
+        "See `examples/unconditional_image_generation/train_unconditional.py` for an example."
+    )
+    deprecate("init_git_repo()", "0.10.0", deprecation_message)
+
     if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
         return
     hub_token = args.hub_token if hasattr(args, "hub_token") else None
@@ -114,6 +120,11 @@ def push_to_hub(
         The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url of the
         commit and an object to track the progress of the commit if `blocking=True`
     """
+    deprecation_message = (
+        "Please use `huggingface_hub.Repository` and `Repository.push_to_hub()`. "
+        "See `examples/unconditional_image_generation/train_unconditional.py` for an example."
+    )
+    deprecate("push_to_hub()", "0.10.0", deprecation_message)
 
     if not hasattr(args, "hub_model_id") or args.hub_model_id is None:
         model_name = Path(args.output_dir).name

From 52f2128dc6fb6b40b3619f7e0ce2810a4d3982a7 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Thu, 27 Oct 2022 15:25:25 +0200
Subject: [PATCH 065/269] update readme for flax examples (#1026)

---
 examples/dreambooth/README.md                 | 132 ++++++++++--------
 examples/dreambooth/requirements_flax.txt     |   9 ++
 examples/text_to_image/README.md              |  80 ++++++-----
 examples/text_to_image/requirements_flax.txt  |   9 ++
 examples/textual_inversion/README.md          |  49 ++++---
 .../textual_inversion/requirements_flax.txt   |   9 ++
 6 files changed, 179 insertions(+), 109 deletions(-)
 create mode 100644 examples/dreambooth/requirements_flax.txt
 create mode 100644 examples/text_to_image/requirements_flax.txt
 create mode 100644 examples/textual_inversion/requirements_flax.txt

diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index 6cae5ca2f245..5b091aece3ce 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -4,7 +4,7 @@
 The `train_dreambooth.py` script shows how to implement the training procedure and adapt it for stable diffusion.
 
 
-## Running locally 
+## Running locally with PyTorch
 ### Installing the dependencies
 
 Before running the scripts, make sure to install the library's training dependencies:
@@ -58,24 +58,6 @@ accelerate launch train_dreambooth.py \
   --max_train_steps=400
 ```
 
-Or use the Flax implementation if you need a speedup
-
-```bash
-export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
-export INSTANCE_DIR="path-to-instance-images"
-export OUTPUT_DIR="path-to-save-model"
-
-python train_dreambooth_flax.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --learning_rate=5e-6 \
-  --max_train_steps=400
-```
-
 ### Training with prior-preservation loss
 
 Prior-preservation is used to avoid overfitting and language-drift. Refer to the paper to learn more about it. For prior-preservation we first generate images using the model with a class prompt and then use those during training along with our data.
@@ -105,28 +87,6 @@ accelerate launch train_dreambooth.py \
   --max_train_steps=800
 ```
 
-Or use the Flax implementation if you need a speedup
-
-```bash
-export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
-export INSTANCE_DIR="path-to-instance-images"
-export CLASS_DIR="path-to-class-images"
-export OUTPUT_DIR="path-to-save-model"
-
-python train_dreambooth_flax.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --class_data_dir=$CLASS_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --with_prior_preservation --prior_loss_weight=1.0 \
-  --instance_prompt="a photo of sks dog" \
-  --class_prompt="a photo of dog" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --learning_rate=5e-6 \
-  --num_class_images=200 \
-  --max_train_steps=800
-```
 
 ### Training on a 16GB GPU:
 
@@ -234,7 +194,58 @@ accelerate launch train_dreambooth.py \
   --max_train_steps=800
 ```
 
-Or use the Flax implementation if you need a speedup
+### Inference
+
+Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_id = "path-to-your-trained-model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "A photo of sks dog in a bucket"
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("dog-bucket.png")
+```
+
+
+## Running with Flax/JAX
+
+For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
+
+____Note: The flax example don't yet support features like gradient checkpoint, gradient accumulation etc, so to use flax for faster training we will need >30GB cards.___
+
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install -U -r requirements_flax.txt
+```
+
+
+### Training without prior preservation loss
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="path-to-instance-images"
+export OUTPUT_DIR="path-to-save-model"
+
+python train_dreambooth_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=5e-6 \
+  --max_train_steps=400
+```
+
+
+### Training with prior preservation loss
 
 ```bash
 export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
@@ -244,7 +255,6 @@ export OUTPUT_DIR="path-to-save-model"
 
 python train_dreambooth_flax.py \
   --pretrained_model_name_or_path=$MODEL_NAME  \
-  --train_text_encoder \
   --instance_data_dir=$INSTANCE_DIR \
   --class_data_dir=$CLASS_DIR \
   --output_dir=$OUTPUT_DIR \
@@ -253,24 +263,32 @@ python train_dreambooth_flax.py \
   --class_prompt="a photo of dog" \
   --resolution=512 \
   --train_batch_size=1 \
-  --learning_rate=2e-6 \
+  --learning_rate=5e-6 \
   --num_class_images=200 \
   --max_train_steps=800
 ```
 
-## Inference
 
-Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt.
-
-```python
-from diffusers import StableDiffusionPipeline
-import torch
-
-model_id = "path-to-your-trained-model"
-pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+### Fine-tune text encoder with the UNet.
 
-prompt = "A photo of sks dog in a bucket"
-image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export INSTANCE_DIR="path-to-instance-images"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
 
-image.save("dog-bucket.png")
-```
+python train_dreambooth_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --train_text_encoder \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --learning_rate=2e-6 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
\ No newline at end of file
diff --git a/examples/dreambooth/requirements_flax.txt b/examples/dreambooth/requirements_flax.txt
new file mode 100644
index 000000000000..a1adc6632f4c
--- /dev/null
+++ b/examples/dreambooth/requirements_flax.txt
@@ -0,0 +1,9 @@
+diffusers>==0.5.1
+transformers>=4.21.0
+flax
+optax
+torch
+torchvision
+ftfy
+tensorboard
+modelcards
\ No newline at end of file
diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md
index 64385ddae0b1..f7c8cdcb4ccb 100644
--- a/examples/text_to_image/README.md
+++ b/examples/text_to_image/README.md
@@ -7,7 +7,7 @@ ___Note___:
 ___This script is experimental. The script fine-tunes the whole model and often times the model overifits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparamters to get the best result on your dataset.___
 
 
-## Running locally 
+## Running locally with PyTorch
 ### Installing the dependencies
 
 Before running the scripts, make sure to install the library's training dependencies:
@@ -62,24 +62,6 @@ accelerate launch train_text_to_image.py \
   --output_dir="sd-pokemon-model" 
 ```
 
-Or use the Flax implementation if you need a speedup
-
-```bash
-export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
-export dataset_name="lambdalabs/pokemon-blip-captions"
-
-python train_text_to_image_flax.py \
-  --pretrained_model_name_or_path=$MODEL_NAME \
-  --dataset_name=$dataset_name \
-  --resolution=512 --center_crop --random_flip \
-  --train_batch_size=1 \
-  --mixed_precision="fp16" \
-  --max_train_steps=15000 \
-  --learning_rate=1e-05 \
-  --max_grad_norm=1 \
-  --output_dir="sd-pokemon-model" 
-```
-
 
 To run on your own training files prepare the dataset according to the format required by `datasets`, you can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata).
 If you wish to use custom loading logic, you should modify the script, we have left pointers for that in the training script.
@@ -104,34 +86,68 @@ accelerate launch train_text_to_image.py \
   --output_dir="sd-pokemon-model"
 ```
 
-Or use the Flax implementation if you need a speedup
+
+Once the training is finished the model will be saved in the `output_dir` specified in the command. In this example it's `sd-pokemon-model`. To load the fine-tuned model for inference just pass that path to `StableDiffusionPipeline`
+
+
+```python
+from diffusers import StableDiffusionPipeline
+
+model_path = "path_to_saved_model"
+pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(prompt="yoda").images[0]
+image.save("yoda-pokemon.png")
+```
+
+
+
+## Training with Flax/JAX
+
+For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
+
+____Note: The flax example don't yet support features like gradient checkpoint, gradient accumulation etc, so to use flax for faster training we will need >30GB cards.___
+
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install -U -r requirements_flax.txt
+```
 
 ```bash
 export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
-export TRAIN_DIR="path_to_your_dataset"
+export dataset_name="lambdalabs/pokemon-blip-captions"
 
 python train_text_to_image_flax.py \
   --pretrained_model_name_or_path=$MODEL_NAME \
-  --train_data_dir=$TRAIN_DIR \
+  --dataset_name=$dataset_name \
   --resolution=512 --center_crop --random_flip \
   --train_batch_size=1 \
   --mixed_precision="fp16" \
   --max_train_steps=15000 \
   --learning_rate=1e-05 \
   --max_grad_norm=1 \
-  --output_dir="sd-pokemon-model"
+  --output_dir="sd-pokemon-model" 
 ```
 
-Once the training is finished the model will be saved in the `output_dir` specified in the command. In this example it's `sd-pokemon-model`. To load the fine-tuned model for inference just pass that path to `StableDiffusionPipeline`
-
 
-```python
-from diffusers import StableDiffusionPipeline
+To run on your own training files prepare the dataset according to the format required by `datasets`, you can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata).
+If you wish to use custom loading logic, you should modify the script, we have left pointers for that in the training script.
 
-model_path = "path_to_saved_model"
-pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
-pipe.to("cuda")
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export TRAIN_DIR="path_to_your_dataset"
 
-image = pipe(prompt="yoda").images[0]
-image.save("yoda-pokemon.png")
+python train_text_to_image_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --output_dir="sd-pokemon-model"
 ```
diff --git a/examples/text_to_image/requirements_flax.txt b/examples/text_to_image/requirements_flax.txt
new file mode 100644
index 000000000000..a1adc6632f4c
--- /dev/null
+++ b/examples/text_to_image/requirements_flax.txt
@@ -0,0 +1,9 @@
+diffusers>==0.5.1
+transformers>=4.21.0
+flax
+optax
+torch
+torchvision
+ftfy
+tensorboard
+modelcards
\ No newline at end of file
diff --git a/examples/textual_inversion/README.md b/examples/textual_inversion/README.md
index 969ac722c14f..9f3aa671456e 100644
--- a/examples/textual_inversion/README.md
+++ b/examples/textual_inversion/README.md
@@ -11,7 +11,7 @@ Colab for training
 Colab for inference
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_conceptualizer_inference.ipynb)
 
-## Running locally 
+## Running locally with PyTorch
 ### Installing the dependencies
 
 Before running the scripts, make sure to install the library's training dependencies:
@@ -68,25 +68,6 @@ accelerate launch textual_inversion.py \
 
 A full training run takes ~1 hour on one V100 GPU.
 
-If you want to speed it up even more, Flax implementation is available:
-
-```bash
-export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
-export DATA_DIR="path-to-dir-containing-images"
-
-python textual_inversion_flax.py \
-  --pretrained_model_name_or_path=$MODEL_NAME \
-  --train_data_dir=$DATA_DIR \
-  --learnable_property="object" \
-  --placeholder_token="<cat-toy>" --initializer_token="toy" \
-  --resolution=512 \
-  --train_batch_size=1 \
-  --max_train_steps=3000 \
-  --learning_rate=5.0e-04 --scale_lr \
-  --output_dir="textual_inversion_cat"
-```
-It should be at least 70% faster than the PyTorch script with the same configuration.
-
 ### Inference
 
 Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `placeholder_token` in your prompt.
@@ -103,3 +84,31 @@ image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
 
 image.save("cat-backpack.png")
 ```
+
+
+## Training with Flax/JAX
+
+For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script.
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install -U -r requirements_flax.txt
+```
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export DATA_DIR="path-to-dir-containing-images"
+
+python textual_inversion_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$DATA_DIR \
+  --learnable_property="object" \
+  --placeholder_token="<cat-toy>" --initializer_token="toy" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --max_train_steps=3000 \
+  --learning_rate=5.0e-04 --scale_lr \
+  --output_dir="textual_inversion_cat"
+```
+It should be at least 70% faster than the PyTorch script with the same configuration.
\ No newline at end of file
diff --git a/examples/textual_inversion/requirements_flax.txt b/examples/textual_inversion/requirements_flax.txt
new file mode 100644
index 000000000000..a1adc6632f4c
--- /dev/null
+++ b/examples/textual_inversion/requirements_flax.txt
@@ -0,0 +1,9 @@
+diffusers>==0.5.1
+transformers>=4.21.0
+flax
+optax
+torch
+torchvision
+ftfy
+tensorboard
+modelcards
\ No newline at end of file

From eceeebdf9147fa8daa6acaa69088def0f60315a2 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Thu, 27 Oct 2022 15:51:11 +0200
Subject: [PATCH 066/269] Update train_dreambooth.py

---
 examples/dreambooth/train_dreambooth.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 9b7e17241db0..8b0b20e07bb5 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -26,7 +26,7 @@
 logger = get_logger(__name__)
 
 
-def parse_args(input_args: None):
+def parse_args(input_args=None):
     parser = argparse.ArgumentParser(description="Simple example of a training script.")
     parser.add_argument(
         "--pretrained_model_name_or_path",

From 939ec17e91432531b8acb7ea0ae9295936ff1da6 Mon Sep 17 00:00:00 2001
From: Denis <lukovnikov.denis@gmail.com>
Date: Thu, 27 Oct 2022 15:55:18 +0200
Subject: [PATCH 067/269] Probably nicer to specify dependency on tensorboard
 in the training example (#998)

tensorboard import in readme, otherwise accelerator.trackers[0] out of range

Co-authored-by: lukovnikov <lukovnikov@users.noreply.github.com>
---
 examples/unconditional_image_generation/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/unconditional_image_generation/README.md b/examples/unconditional_image_generation/README.md
index a2a5a840af68..e9c461b482de 100644
--- a/examples/unconditional_image_generation/README.md
+++ b/examples/unconditional_image_generation/README.md
@@ -7,7 +7,7 @@ Creating a training image set is [described in a different document](https://hug
 Before running the scripts, make sure to install the library's training dependencies:
 
 ```bash
-pip install diffusers[training] accelerate datasets
+pip install diffusers[training] accelerate datasets tensorboard
 ```
 
 And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:

From a6314a8d4e1c301bce4e45c10f325f594220617f Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 27 Oct 2022 15:55:36 +0200
Subject: [PATCH 068/269] Add `--dataloader_num_workers` to the DDPM training
 example (#1027)

---
 .../train_unconditional.py                        | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py
index 6a1495b34c9f..2bc8114caca8 100644
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -83,7 +83,16 @@ def parse_args():
         "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
     )
     parser.add_argument(
-        "--eval_batch_size", type=int, default=16, help="Batch size (per device) for the eval dataloader."
+        "--eval_batch_size", type=int, default=16, help="The number of images to generate for evaluation."
+    )
+    parser.add_argument(
+        "--dataloader_num_workers",
+        type=int,
+        default=0,
+        help=(
+            "The number of subprocesses to use for data loading. 0 means that the data will be loaded in the main"
+            " process."
+        ),
     )
     parser.add_argument("--num_epochs", type=int, default=100)
     parser.add_argument("--save_images_epochs", type=int, default=10, help="How often to save images during training.")
@@ -249,7 +258,9 @@ def transforms(examples):
         return {"input": images}
 
     dataset.set_transform(transforms)
-    train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=True)
+    train_dataloader = torch.utils.data.DataLoader(
+        dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
+    )
 
     lr_scheduler = get_scheduler(
         args.lr_scheduler,

From de00c632174704d6d92405120f171e58b38e0ee8 Mon Sep 17 00:00:00 2001
From: Pi Esposito <piero.skywalker@gmail.com>
Date: Thu, 27 Oct 2022 11:52:21 -0300
Subject: [PATCH 069/269] Document sequential CPU offload method on Stable
 Diffusion pipeline (#1024)

* document cpu offloading method

* address review comments

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 docs/source/optimization/fp16.mdx             | 64 +++++++++++++++++--
 .../pipeline_stable_diffusion.py              |  5 ++
 2 files changed, 62 insertions(+), 7 deletions(-)

diff --git a/docs/source/optimization/fp16.mdx b/docs/source/optimization/fp16.mdx
index d1dd87f7652f..b4ffed31c76b 100644
--- a/docs/source/optimization/fp16.mdx
+++ b/docs/source/optimization/fp16.mdx
@@ -14,9 +14,8 @@ specific language governing permissions and limitations under the License.
 
 We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for memory or speed.
 
-
 |                  | Latency | Speedup |
-|------------------|---------|---------|
+| ---------------- | ------- | ------- |
 | original         | 9.50s   | x1      |
 | cuDNN auto-tuner | 9.37s   | x1.01   |
 | autocast (fp16)  | 5.47s   | x1.91   |
@@ -24,7 +23,11 @@ We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for
 | channels last    | 3.30s   | x2.87   |
 | traced UNet      | 3.21s   | x2.96   |
 
-<em>obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps.</em>
+<em>
+  obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from
+  the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM
+  steps.
+</em>
 
 ## Enable cuDNN auto-tuner
 
@@ -61,7 +64,7 @@ pipe = pipe.to("cuda")
 
 prompt = "a photo of an astronaut riding a horse on mars"
 with autocast("cuda"):
-    image = pipe(prompt).images[0]  
+    image = pipe(prompt).images[0]
 ```
 
 Despite the precision loss, in our experience the final image results look the same as the `float32` versions. Feel free to experiment and report back!
@@ -79,7 +82,7 @@ pipe = StableDiffusionPipeline.from_pretrained(
 pipe = pipe.to("cuda")
 
 prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt).images[0]  
+image = pipe(prompt).images[0]
 ```
 
 ## Sliced attention for additional memory savings
@@ -87,7 +90,10 @@ image = pipe(prompt).images[0]
 For even additional memory savings, you can use a sliced version of attention that performs the computation in steps instead of all at once.
 
 <Tip>
-Attention slicing is useful even if a batch size of just 1 is used - as long as the model uses more than one attention head. If there is more than one attention head the *QK^T* attention matrix can be computed sequentially for each head which can save a significant amount of memory.
+  Attention slicing is useful even if a batch size of just 1 is used - as long
+  as the model uses more than one attention head. If there is more than one
+  attention head the *QK^T* attention matrix can be computed sequentially for
+  each head which can save a significant amount of memory.
 </Tip>
 
 To perform the attention computation sequentially over each head, you only need to invoke [`~StableDiffusionPipeline.enable_attention_slicing`] in your pipeline before inference, like here:
@@ -105,11 +111,55 @@ pipe = pipe.to("cuda")
 
 prompt = "a photo of an astronaut riding a horse on mars"
 pipe.enable_attention_slicing()
-image = pipe(prompt).images[0]  
+image = pipe(prompt).images[0]
 ```
 
 There's a small performance penalty of about 10% slower inference times, but this method allows you to use Stable Diffusion in as little as 3.2 GB of VRAM!
 
+## Offloading to CPU with accelerate for memory savings
+
+For additional memory savings, you can offload the weights to CPU and load them to GPU when performing the forward pass.
+
+To perform CPU offloading, all you have to do is invoke [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+image = pipe(prompt).images[0]
+```
+
+And you can get the memory consumption to < 2GB.
+
+If is also possible to chain it with attention slicing for minimal memory consumption, running it in as little as < 800mb of GPU vRAM:
+
+```Python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+pipe.enable_attention_slicing(1)
+
+image = pipe(prompt).images[0]
+```
+
 ## Using Channels Last memory format
 
 Channels last memory format is an alternative way of ordering NCHW tensors in memory preserving dimensions ordering. Channels last tensors ordered in such a way that channels become the densest dimension (aka storing images pixel-per-pixel). Since not all operators currently support channels last format it may result in a worst performance, so it's better to try it and see if it works for your model.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index cca11281359a..e80cc1360b33 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -120,6 +120,11 @@ def disable_attention_slicing(self):
         self.enable_attention_slicing(None)
 
     def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
         if is_accelerate_available():
             from accelerate import cpu_offload
         else:

From fb38bb1621976e6695ee8f8b2e95ced877b69c3b Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 27 Oct 2022 22:44:35 +0200
Subject: [PATCH 070/269] Support grayscale images in `numpy_to_pil` (#1025)

---
 src/diffusers/pipeline_flax_utils.py | 6 +++++-
 src/diffusers/pipeline_utils.py      | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
index 3c23693b40ef..e96c0c7467f3 100644
--- a/src/diffusers/pipeline_flax_utils.py
+++ b/src/diffusers/pipeline_flax_utils.py
@@ -444,7 +444,11 @@ def numpy_to_pil(images):
         if images.ndim == 3:
             images = images[None, ...]
         images = (images * 255).round().astype("uint8")
-        pil_images = [Image.fromarray(image) for image in images]
+        if images.shape[-1] == 1:
+            # special case for grayscale (single channel) images
+            pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
+        else:
+            pil_images = [Image.fromarray(image) for image in images]
 
         return pil_images
 
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index c9c58a748831..c0a44363a2f9 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -625,7 +625,11 @@ def numpy_to_pil(images):
         if images.ndim == 3:
             images = images[None, ...]
         images = (images * 255).round().astype("uint8")
-        pil_images = [Image.fromarray(image) for image in images]
+        if images.shape[-1] == 1:
+            # special case for grayscale (single channel) images
+            pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
+        else:
+            pil_images = [Image.fromarray(image) for image in images]
 
         return pil_images
 

From 1e07b6b334c0b94e06ca9860641bd2e16aff90fb Mon Sep 17 00:00:00 2001
From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com>
Date: Fri, 28 Oct 2022 16:21:34 +0700
Subject: [PATCH 071/269] [Flax SD finetune] Fix dtype (#1038)

fix jnp dtype
---
 examples/text_to_image/train_text_to_image_flax.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py
index ab8d2b7ee2a3..cacfacef498b 100644
--- a/examples/text_to_image/train_text_to_image_flax.py
+++ b/examples/text_to_image/train_text_to_image_flax.py
@@ -371,11 +371,11 @@ def collate_fn(examples):
         train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=total_train_batch_size, drop_last=True
     )
 
-    weight_dtype = torch.float32
+    weight_dtype = jnp.float32
     if args.mixed_precision == "fp16":
-        weight_dtype = torch.float16
+        weight_dtype = jnp.float16
     elif args.mixed_precision == "bf16":
-        weight_dtype = torch.bfloat16
+        weight_dtype = jnp.bfloat16
 
     # Load models and create wrapper for stable diffusion
     tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")

From ab079f27cf8c8c570d82711d209196e8240d9dd8 Mon Sep 17 00:00:00 2001
From: Nouamane Tazi <nouamane98@gmail.com>
Date: Fri, 28 Oct 2022 10:25:21 +0100
Subject: [PATCH 072/269] fix `F.interpolate()` for large batch sizes (#1006)

* fix `upsample_nearest_nhwc` for large bsz

* fix `upsample_nearest_nhwc` for large bsz
---
 src/diffusers/models/resnet.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
index fbd78b512a6b..7bb5416adf24 100644
--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -48,6 +48,10 @@ def forward(self, hidden_states, output_size=None):
         if dtype == torch.bfloat16:
             hidden_states = hidden_states.to(torch.float32)
 
+        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+        if hidden_states.shape[0] >= 64:
+            hidden_states = hidden_states.contiguous()
+
         # if `output_size` is passed we force the interpolation output
         # size and do not make use of `scale_factor=2`
         if output_size is None:
@@ -376,6 +380,10 @@ def forward(self, input_tensor, temb):
         hidden_states = self.nonlinearity(hidden_states)
 
         if self.upsample is not None:
+            # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+            if hidden_states.shape[0] >= 64:
+                input_tensor = input_tensor.contiguous()
+                hidden_states = hidden_states.contiguous()
             input_tensor = self.upsample(input_tensor)
             hidden_states = self.upsample(hidden_states)
         elif self.downsample is not None:

From a80480f0f2c6b7981297e62a4878a4515148f1ad Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 28 Oct 2022 13:43:26 +0200
Subject: [PATCH 073/269] [Tests] Improve unet / vae tests (#1018)

* improve tests

* up

* finish

* upload

* add init

* up

* finish vae

* finish

* reduce loading time with device_map

* remove device_map from CPU

* uP
---
 setup.py                                   |   2 +
 src/diffusers/dependency_versions_table.py |   1 +
 src/diffusers/utils/__init__.py            |   2 +-
 tests/models/__init__.py                   |   0
 tests/{ => models}/test_models_unet_1d.py  |   2 +-
 tests/{ => models}/test_models_unet_2d.py  | 248 +++++++++++++----
 tests/models/test_models_vae.py            | 303 +++++++++++++++++++++
 tests/{ => models}/test_models_vae_flax.py |   2 +-
 tests/{ => models}/test_models_vq.py       |   2 +-
 tests/test_models_vae.py                   | 132 ---------
 10 files changed, 506 insertions(+), 188 deletions(-)
 create mode 100644 tests/models/__init__.py
 rename tests/{ => models}/test_models_unet_1d.py (98%)
 rename tests/{ => models}/test_models_unet_2d.py (60%)
 create mode 100644 tests/models/test_models_vae.py
 rename tests/{ => models}/test_models_vae_flax.py (94%)
 rename tests/{ => models}/test_models_vq.py (98%)
 delete mode 100644 tests/test_models_vae.py

diff --git a/setup.py b/setup.py
index b4c1613ba115..6f0742e83ee5 100644
--- a/setup.py
+++ b/setup.py
@@ -94,6 +94,7 @@
     "modelcards>=0.1.4",
     "numpy",
     "onnxruntime",
+    "parameterized",
     "pytest",
     "pytest-timeout",
     "pytest-xdist",
@@ -181,6 +182,7 @@ def run(self):
     "accelerate",
     "datasets",
     "onnxruntime",
+    "parameterized",
     "pytest",
     "pytest-timeout",
     "pytest-xdist",
diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py
index 8b10d70a26f7..64e55e932c1c 100644
--- a/src/diffusers/dependency_versions_table.py
+++ b/src/diffusers/dependency_versions_table.py
@@ -18,6 +18,7 @@
     "modelcards": "modelcards>=0.1.4",
     "numpy": "numpy",
     "onnxruntime": "onnxruntime",
+    "parameterized": "parameterized",
     "pytest": "pytest",
     "pytest-timeout": "pytest-timeout",
     "pytest-xdist": "pytest-xdist",
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index 51798e2ad765..ae00c1cf6a10 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -40,7 +40,7 @@
 
 
 if is_torch_available():
-    from .testing_utils import floats_tensor, load_image, parse_flag_from_env, slow, torch_device
+    from .testing_utils import floats_tensor, load_image, parse_flag_from_env, require_torch_gpu, slow, torch_device
 
 
 logger = get_logger(__name__)
diff --git a/tests/models/__init__.py b/tests/models/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py
similarity index 98%
rename from tests/test_models_unet_1d.py
rename to tests/models/test_models_unet_1d.py
index c274ce419211..286c7525e2ca 100644
--- a/tests/test_models_unet_1d.py
+++ b/tests/models/test_models_unet_1d.py
@@ -28,7 +28,7 @@ class UnetModel1DTests(unittest.TestCase):
     @slow
     def test_unet_1d_maestro(self):
         model_id = "harmonai/maestro-150k"
-        model = UNet1DModel.from_pretrained(model_id, subfolder="unet")
+        model = UNet1DModel.from_pretrained(model_id, subfolder="unet", device_map="auto")
         model.to(torch_device)
 
         sample_size = 65536
diff --git a/tests/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
similarity index 60%
rename from tests/test_models_unet_2d.py
rename to tests/models/test_models_unet_2d.py
index 393d47887e6d..b2aa90b236d0 100644
--- a/tests/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -21,9 +21,10 @@
 import torch
 
 from diffusers import UNet2DConditionModel, UNet2DModel
-from diffusers.utils import floats_tensor, slow, torch_device
+from diffusers.utils import floats_tensor, require_torch_gpu, slow, torch_device
+from parameterized import parameterized
 
-from .test_modeling_common import ModelTesterMixin
+from ..test_modeling_common import ModelTesterMixin
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
@@ -66,28 +67,6 @@ def prepare_init_args_and_inputs_for_common(self):
         return init_dict, inputs_dict
 
 
-#    TODO(Patrick) - Re-add this test after having correctly added the final VE checkpoints
-#    def test_output_pretrained(self):
-#        model = UNet2DModel.from_pretrained("fusing/ddpm_dummy_update", subfolder="unet")
-#        model.eval()
-#
-#        torch.manual_seed(0)
-#        if torch.cuda.is_available():
-#            torch.cuda.manual_seed_all(0)
-#
-#        noise = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
-#        time_step = torch.tensor([10])
-#
-#        with torch.no_grad():
-#            output = model(noise, time_step).sample
-#
-#        output_slice = output[0, -1, -3:, -3:].flatten()
-# fmt: off
-#        expected_output_slice = torch.tensor([0.2891, -0.1899, 0.2595, -0.6214, 0.0968, -0.2622, 0.4688, 0.1311, 0.0053])
-# fmt: on
-#        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2))
-
-
 class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase):
     model_class = UNet2DModel
 
@@ -170,7 +149,9 @@ def test_from_pretrained_accelerate_wont_change_results(self):
         torch.cuda.empty_cache()
         gc.collect()
 
-        model_normal_load, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
+        model_normal_load, _ = UNet2DModel.from_pretrained(
+            "fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto"
+        )
         model_normal_load.to(torch_device)
         model_normal_load.eval()
         arr_normal_load = model_normal_load(noise, time_step)["sample"]
@@ -309,31 +290,6 @@ def test_gradient_checkpointing(self):
             self.assertTrue(torch.allclose(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
 
 
-#    TODO(Patrick) - Re-add this test after having cleaned up LDM
-#    def test_output_pretrained_spatial_transformer(self):
-#        model = UNetLDMModel.from_pretrained("fusing/unet-ldm-dummy-spatial")
-#        model.eval()
-#
-#        torch.manual_seed(0)
-#        if torch.cuda.is_available():
-#            torch.cuda.manual_seed_all(0)
-#
-#        noise = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
-#        context = torch.ones((1, 16, 64), dtype=torch.float32)
-#        time_step = torch.tensor([10] * noise.shape[0])
-#
-#        with torch.no_grad():
-#            output = model(noise, time_step, context=context)
-#
-#        output_slice = output[0, -1, -3:, -3:].flatten()
-# fmt: off
-#        expected_output_slice = torch.tensor([61.3445, 56.9005, 29.4339, 59.5497, 60.7375, 34.1719, 48.1951, 42.6569, 25.0890])
-# fmt: on
-#
-#        self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
-#
-
-
 class NCSNppModelTests(ModelTesterMixin, unittest.TestCase):
     model_class = UNet2DModel
 
@@ -383,7 +339,9 @@ def prepare_init_args_and_inputs_for_common(self):
 
     @slow
     def test_from_pretrained_hub(self):
-        model, loading_info = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", output_loading_info=True)
+        model, loading_info = UNet2DModel.from_pretrained(
+            "google/ncsnpp-celebahq-256", output_loading_info=True, device_map="auto"
+        )
         self.assertIsNotNone(model)
         self.assertEqual(len(loading_info["missing_keys"]), 0)
 
@@ -397,7 +355,7 @@ def test_from_pretrained_hub(self):
 
     @slow
     def test_output_pretrained_ve_mid(self):
-        model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256")
+        model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", device_map="auto")
         model.to(torch_device)
 
         torch.manual_seed(0)
@@ -449,3 +407,189 @@ def test_output_pretrained_ve_large(self):
     def test_forward_with_norm_groups(self):
         # not required for this model
         pass
+
+
+@slow
+class UNet2DConditionModelIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False):
+        batch_size, channels, height, width = shape
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        dtype = torch.float16 if fp16 else torch.float32
+        image = torch.randn(batch_size, channels, height, width, device=torch_device, generator=generator, dtype=dtype)
+
+        return image
+
+    def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
+        revision = "fp16" if fp16 else None
+        torch_dtype = torch.float16 if fp16 else torch.float32
+
+        model = UNet2DConditionModel.from_pretrained(
+            model_id, subfolder="unet", torch_dtype=torch_dtype, revision=revision, device_map="auto"
+        )
+        model.to(torch_device).eval()
+
+        return model
+
+    def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False):
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        dtype = torch.float16 if fp16 else torch.float32
+        return torch.randn(shape, device=torch_device, generator=generator, dtype=dtype)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, 4, [-0.4424, 0.1510, -0.1937, 0.2118, 0.3746, -0.3957, 0.0160, -0.0435]],
+            [47, 0.55, [-0.1508, 0.0379, -0.3075, 0.2540, 0.3633, -0.0821, 0.1719, -0.0207]],
+            [21, 0.89, [-0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, 0.1778]],
+            [9, 1000, [0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, -0.4424]],
+            # fmt: on
+        ]
+    )
+    def test_compvis_sd_v1_4(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4")
+        latents = self.get_latents(seed)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [83, 4, [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806]],
+            [17, 0.55, [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701]],
+            [8, 0.89, [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639]],
+            [3, 1000, [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078]],
+            # fmt: on
+        ]
+    )
+    @require_torch_gpu
+    def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True)
+        latents = self.get_latents(seed, fp16=True)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, 4, [-0.4430, 0.1570, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722]],
+            [47, 0.55, [-0.1415, 0.0129, -0.3136, 0.2257, 0.3430, -0.0536, 0.2114, -0.0436]],
+            [21, 0.89, [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.1750]],
+            [9, 1000, [0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, -0.4423]],
+            # fmt: on
+        ]
+    )
+    def test_compvis_sd_v1_5(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5")
+        latents = self.get_latents(seed)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [83, 4, [-0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, -0.5972]],
+            [17, 0.55, [-0.1290, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, 0.0322]],
+            [8, 0.89, [-0.5283, 0.1198, 0.0870, -0.1141, 0.9189, -0.0150, 0.5474, 0.4319]],
+            [3, 1000, [-0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.0280, -1.0020]],
+            # fmt: on
+        ]
+    )
+    @require_torch_gpu
+    def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5", fp16=True)
+        latents = self.get_latents(seed, fp16=True)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == latents.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, 4, [-0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, -0.4858]],
+            [47, 0.55, [-0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, 0.9073]],
+            [21, 0.89, [0.0327, 0.4399, -0.6358, 0.3417, 0.4120, -0.5621, -0.0397, -1.0430]],
+            [9, 1000, [0.1600, 0.7303, -1.0556, -0.3515, -0.7440, -1.2037, -1.8149, -1.8931]],
+            # fmt: on
+        ]
+    )
+    def test_compvis_sd_inpaint(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting")
+        latents = self.get_latents(seed, shape=(4, 9, 64, 64))
+        encoder_hidden_states = self.get_encoder_hidden_states(seed)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == (4, 4, 64, 64)
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [83, 4, [-0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, 1.1387]],
+            [17, 0.55, [0.0975, -0.2856, -0.3508, -0.4600, 0.3376, 0.2930, -0.2747, -0.7026]],
+            [8, 0.89, [-0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, -0.3486]],
+            [3, 1000, [0.4790, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, -0.9741]],
+            # fmt: on
+        ]
+    )
+    @require_torch_gpu
+    def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice):
+        model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting", fp16=True)
+        latents = self.get_latents(seed, shape=(4, 9, 64, 64), fp16=True)
+        encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True)
+
+        with torch.no_grad():
+            sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
+
+        assert sample.shape == (4, 4, 64, 64)
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
new file mode 100644
index 000000000000..65fe81fd3379
--- /dev/null
+++ b/tests/models/test_models_vae.py
@@ -0,0 +1,303 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import torch
+
+from diffusers import AutoencoderKL
+from diffusers.modeling_utils import ModelMixin
+from diffusers.utils import floats_tensor, require_torch_gpu, slow, torch_device
+from parameterized import parameterized
+
+from ..test_modeling_common import ModelTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase):
+    model_class = AutoencoderKL
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+
+        return {"sample": image}
+
+    @property
+    def input_shape(self):
+        return (3, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (3, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": [32, 64],
+            "in_channels": 3,
+            "out_channels": 3,
+            "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            "latent_channels": 4,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_forward_signature(self):
+        pass
+
+    def test_training(self):
+        pass
+
+    def test_from_pretrained_hub(self):
+        model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True)
+        self.assertIsNotNone(model)
+        self.assertEqual(len(loading_info["missing_keys"]), 0)
+
+        model.to(torch_device)
+        image = model(**self.dummy_input)
+
+        assert image is not None, "Make sure output is not None"
+
+    def test_output_pretrained(self):
+        model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy")
+        model = model.to(torch_device)
+        model.eval()
+
+        # One-time warmup pass (see #372)
+        if torch_device == "mps" and isinstance(model, ModelMixin):
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            image = image.to(torch_device)
+            with torch.no_grad():
+                _ = model(image, sample_posterior=True).sample
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
+
+        image = torch.randn(
+            1,
+            model.config.in_channels,
+            model.config.sample_size,
+            model.config.sample_size,
+            generator=torch.manual_seed(0),
+        )
+        image = image.to(torch_device)
+        with torch.no_grad():
+            output = model(image, sample_posterior=True, generator=generator).sample
+
+        output_slice = output[0, -1, -3:, -3:].flatten().cpu()
+
+        # Since the VAE Gaussian prior's generator is seeded on the appropriate device,
+        # the expected output slices are not the same for CPU and GPU.
+        if torch_device == "mps":
+            expected_output_slice = torch.tensor(
+                [
+                    -4.0078e-01,
+                    -3.8323e-04,
+                    -1.2681e-01,
+                    -1.1462e-01,
+                    2.0095e-01,
+                    1.0893e-01,
+                    -8.8247e-02,
+                    -3.0361e-01,
+                    -9.8644e-03,
+                ]
+            )
+        elif torch_device == "cpu":
+            expected_output_slice = torch.tensor(
+                [-0.1352, 0.0878, 0.0419, -0.0818, -0.1069, 0.0688, -0.1458, -0.4446, -0.0026]
+            )
+        else:
+            expected_output_slice = torch.tensor(
+                [-0.2421, 0.4642, 0.2507, -0.0438, 0.0682, 0.3160, -0.2018, -0.0727, 0.2485]
+            )
+
+        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2))
+
+
+@slow
+class AutoencoderKLIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
+        batch_size, channels, height, width = shape
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        dtype = torch.float16 if fp16 else torch.float32
+        image = torch.randn(batch_size, channels, height, width, device=torch_device, generator=generator, dtype=dtype)
+
+        return image
+
+    def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False):
+        revision = "fp16" if fp16 else None
+        torch_dtype = torch.float16 if fp16 else torch.float32
+
+        model = AutoencoderKL.from_pretrained(
+            model_id, subfolder="vae", torch_dtype=torch_dtype, revision=revision, device_map="auto"
+        )
+        model.to(torch_device).eval()
+
+        return model
+
+    def get_generator(self, seed=0):
+        return torch.Generator(device=torch_device).manual_seed(seed)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824]],
+            [47, [-0.2376, 0.1168, 0.1332, -0.4840, -0.2508, -0.0791, -0.0493, -0.4089]],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion(self, seed, expected_slice):
+        model = self.get_sd_vae_model()
+        image = self.get_sd_image(seed)
+        generator = self.get_generator(seed)
+
+        with torch.no_grad():
+            sample = model(image, generator=generator, sample_posterior=True).sample
+
+        assert sample.shape == image.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]],
+            [47, [-0.4128, -0.1320, -0.3704, 0.1965, -0.4116, -0.2332, -0.3340, 0.2247]],
+            # fmt: on
+        ]
+    )
+    @require_torch_gpu
+    def test_stable_diffusion_fp16(self, seed, expected_slice):
+        model = self.get_sd_vae_model(fp16=True)
+        image = self.get_sd_image(seed, fp16=True)
+        generator = self.get_generator(seed)
+
+        with torch.no_grad():
+            sample = model(image, generator=generator, sample_posterior=True).sample
+
+        assert sample.shape == image.shape
+
+        output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814]],
+            [47, [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085]],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion_mode(self, seed, expected_slice):
+        model = self.get_sd_vae_model()
+        image = self.get_sd_image(seed)
+
+        with torch.no_grad():
+            sample = model(image).sample
+
+        assert sample.shape == image.shape
+
+        output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [13, [-0.2051, -0.1803, -0.2311, -0.2114, -0.3292, -0.3574, -0.2953, -0.3323]],
+            [37, [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.4990, -0.3720, -0.4925]],
+            # fmt: on
+        ]
+    )
+    @require_torch_gpu
+    def test_stable_diffusion_decode(self, seed, expected_slice):
+        model = self.get_sd_vae_model()
+        encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64))
+
+        with torch.no_grad():
+            sample = model.decode(encoding).sample
+
+        assert list(sample.shape) == [3, 3, 512, 512]
+
+        output_slice = sample[-1, -2:, :2, -2:].flatten().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [27, [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.1930, -0.1465, -0.2039]],
+            [16, [-0.1628, -0.2134, -0.2747, -0.2642, -0.3774, -0.4404, -0.3687, -0.4277]],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion_decode_fp16(self, seed, expected_slice):
+        model = self.get_sd_vae_model(fp16=True)
+        encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True)
+
+        with torch.no_grad():
+            sample = model.decode(encoding).sample
+
+        assert list(sample.shape) == [3, 3, 512, 512]
+
+        output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+
+    @parameterized.expand(
+        [
+            # fmt: off
+            [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]],
+            [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]],
+            # fmt: on
+        ]
+    )
+    def test_stable_diffusion_encode_sample(self, seed, expected_slice):
+        model = self.get_sd_vae_model()
+        image = self.get_sd_image(seed)
+        generator = self.get_generator(seed)
+
+        with torch.no_grad():
+            dist = model.encode(image).latent_dist
+            sample = dist.sample(generator=generator)
+
+        assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]]
+
+        output_slice = sample[0, -1, -3:, -3:].flatten().cpu()
+        expected_output_slice = torch.tensor(expected_slice)
+
+        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
diff --git a/tests/test_models_vae_flax.py b/tests/models/test_models_vae_flax.py
similarity index 94%
rename from tests/test_models_vae_flax.py
rename to tests/models/test_models_vae_flax.py
index e5c56b61a5a4..8fedb85eccfc 100644
--- a/tests/test_models_vae_flax.py
+++ b/tests/models/test_models_vae_flax.py
@@ -4,7 +4,7 @@
 from diffusers.utils import is_flax_available
 from diffusers.utils.testing_utils import require_flax
 
-from .test_modeling_common_flax import FlaxModelTesterMixin
+from ..test_modeling_common_flax import FlaxModelTesterMixin
 
 
 if is_flax_available():
diff --git a/tests/test_models_vq.py b/tests/models/test_models_vq.py
similarity index 98%
rename from tests/test_models_vq.py
rename to tests/models/test_models_vq.py
index 9a2094d46cb4..f58e90469885 100644
--- a/tests/test_models_vq.py
+++ b/tests/models/test_models_vq.py
@@ -20,7 +20,7 @@
 from diffusers import VQModel
 from diffusers.utils import floats_tensor, torch_device
 
-from .test_modeling_common import ModelTesterMixin
+from ..test_modeling_common import ModelTesterMixin
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
diff --git a/tests/test_models_vae.py b/tests/test_models_vae.py
deleted file mode 100644
index 49610f848364..000000000000
--- a/tests/test_models_vae.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# coding=utf-8
-# Copyright 2022 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import torch
-
-from diffusers import AutoencoderKL
-from diffusers.modeling_utils import ModelMixin
-from diffusers.utils import floats_tensor, torch_device
-
-from .test_modeling_common import ModelTesterMixin
-
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
-class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase):
-    model_class = AutoencoderKL
-
-    @property
-    def dummy_input(self):
-        batch_size = 4
-        num_channels = 3
-        sizes = (32, 32)
-
-        image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
-
-        return {"sample": image}
-
-    @property
-    def input_shape(self):
-        return (3, 32, 32)
-
-    @property
-    def output_shape(self):
-        return (3, 32, 32)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "block_out_channels": [32, 64],
-            "in_channels": 3,
-            "out_channels": 3,
-            "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            "latent_channels": 4,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_forward_signature(self):
-        pass
-
-    def test_training(self):
-        pass
-
-    def test_from_pretrained_hub(self):
-        model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True)
-        self.assertIsNotNone(model)
-        self.assertEqual(len(loading_info["missing_keys"]), 0)
-
-        model.to(torch_device)
-        image = model(**self.dummy_input)
-
-        assert image is not None, "Make sure output is not None"
-
-    def test_output_pretrained(self):
-        model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy")
-        model = model.to(torch_device)
-        model.eval()
-
-        # One-time warmup pass (see #372)
-        if torch_device == "mps" and isinstance(model, ModelMixin):
-            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
-            image = image.to(torch_device)
-            with torch.no_grad():
-                _ = model(image, sample_posterior=True).sample
-            generator = torch.manual_seed(0)
-        else:
-            generator = torch.Generator(device=torch_device).manual_seed(0)
-
-        image = torch.randn(
-            1,
-            model.config.in_channels,
-            model.config.sample_size,
-            model.config.sample_size,
-            generator=torch.manual_seed(0),
-        )
-        image = image.to(torch_device)
-        with torch.no_grad():
-            output = model(image, sample_posterior=True, generator=generator).sample
-
-        output_slice = output[0, -1, -3:, -3:].flatten().cpu()
-
-        # Since the VAE Gaussian prior's generator is seeded on the appropriate device,
-        # the expected output slices are not the same for CPU and GPU.
-        if torch_device == "mps":
-            expected_output_slice = torch.tensor(
-                [
-                    -4.0078e-01,
-                    -3.8323e-04,
-                    -1.2681e-01,
-                    -1.1462e-01,
-                    2.0095e-01,
-                    1.0893e-01,
-                    -8.8247e-02,
-                    -3.0361e-01,
-                    -9.8644e-03,
-                ]
-            )
-        elif torch_device == "cpu":
-            expected_output_slice = torch.tensor(
-                [-0.1352, 0.0878, 0.0419, -0.0818, -0.1069, 0.0688, -0.1458, -0.4446, -0.0026]
-            )
-        else:
-            expected_output_slice = torch.tensor(
-                [-0.2421, 0.4642, 0.2507, -0.0438, 0.0682, 0.3160, -0.2018, -0.0727, 0.2485]
-            )
-
-        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2))

From d2d9764f35d925302764f7bf2233e6d1c750ea6b Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 28 Oct 2022 14:46:39 +0200
Subject: [PATCH 074/269] [Tests] Speed up slow tests (#1040)

* [Tests] Speed up slow tests

* Up

* up
---
 .../dance_diffusion/test_dance_diffusion.py   |  6 ++--
 tests/pipelines/ddim/test_ddim.py             |  4 +--
 tests/pipelines/ddpm/test_ddpm.py             |  2 +-
 tests/pipelines/karras_ve/test_karras_ve.py   |  2 +-
 .../latent_diffusion/test_latent_diffusion.py |  4 +--
 tests/pipelines/pndm/test_pndm.py             |  2 +-
 .../score_sde_ve/test_score_sde_ve.py         |  2 +-
 .../stable_diffusion/test_stable_diffusion.py | 25 ++++++++------
 .../test_stable_diffusion_img2img.py          |  4 ++-
 .../test_stable_diffusion_inpaint.py          |  6 +++-
 .../test_stable_diffusion_inpaint_legacy.py   |  4 ++-
 tests/test_pipelines.py                       | 33 +++++++++++--------
 12 files changed, 58 insertions(+), 36 deletions(-)

diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
index 72e67e4479d2..737d1c57d154 100644
--- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py
+++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -86,7 +86,7 @@ def tearDown(self):
     def test_dance_diffusion(self):
         device = torch_device
 
-        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k")
+        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", device_map="auto")
         pipe = pipe.to(device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -103,7 +103,9 @@ def test_dance_diffusion(self):
     def test_dance_diffusion_fp16(self):
         device = torch_device
 
-        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16)
+        pipe = DanceDiffusionPipeline.from_pretrained(
+            "harmonai/maestro-150k", torch_dtype=torch.float16, device_map="auto"
+        )
         pipe = pipe.to(device)
         pipe.set_progress_bar_config(disable=None)
 
diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py
index 4445fe7feecf..41280a9bb4f0 100644
--- a/tests/pipelines/ddim/test_ddim.py
+++ b/tests/pipelines/ddim/test_ddim.py
@@ -78,7 +78,7 @@ class DDIMPipelineIntegrationTests(unittest.TestCase):
     def test_inference_ema_bedroom(self):
         model_id = "google/ddpm-ema-bedroom-256"
 
-        unet = UNet2DModel.from_pretrained(model_id)
+        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
         scheduler = DDIMScheduler.from_config(model_id)
 
         ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
@@ -97,7 +97,7 @@ def test_inference_ema_bedroom(self):
     def test_inference_cifar10(self):
         model_id = "google/ddpm-cifar10-32"
 
-        unet = UNet2DModel.from_pretrained(model_id)
+        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
         scheduler = DDIMScheduler()
 
         ddim = DDIMPipeline(unet=unet, scheduler=scheduler)
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
index c58e2db38f21..c6b24309ced0 100644
--- a/tests/pipelines/ddpm/test_ddpm.py
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -38,7 +38,7 @@ class DDPMPipelineIntegrationTests(unittest.TestCase):
     def test_inference_cifar10(self):
         model_id = "google/ddpm-cifar10-32"
 
-        unet = UNet2DModel.from_pretrained(model_id)
+        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
         scheduler = DDPMScheduler.from_config(model_id)
 
         ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
diff --git a/tests/pipelines/karras_ve/test_karras_ve.py b/tests/pipelines/karras_ve/test_karras_ve.py
index 1fafa1cb40fa..caca35f693cf 100644
--- a/tests/pipelines/karras_ve/test_karras_ve.py
+++ b/tests/pipelines/karras_ve/test_karras_ve.py
@@ -70,7 +70,7 @@ def test_inference(self):
 class KarrasVePipelineIntegrationTests(unittest.TestCase):
     def test_inference(self):
         model_id = "google/ncsnpp-celebahq-256"
-        model = UNet2DModel.from_pretrained(model_id)
+        model = UNet2DModel.from_pretrained(model_id, device_map="auto")
         scheduler = KarrasVeScheduler()
 
         pipe = KarrasVePipeline(unet=model, scheduler=scheduler)
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/tests/pipelines/latent_diffusion/test_latent_diffusion.py
index 085cdb4e766b..beb30a24f6d0 100644
--- a/tests/pipelines/latent_diffusion/test_latent_diffusion.py
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion.py
@@ -121,7 +121,7 @@ def test_inference_text2img(self):
 @require_torch
 class LDMTextToImagePipelineIntegrationTests(unittest.TestCase):
     def test_inference_text2img(self):
-        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256", device_map="auto")
         ldm.to(torch_device)
         ldm.set_progress_bar_config(disable=None)
 
@@ -138,7 +138,7 @@ def test_inference_text2img(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_inference_text2img_fast(self):
-        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256", device_map="auto")
         ldm.to(torch_device)
         ldm.set_progress_bar_config(disable=None)
 
diff --git a/tests/pipelines/pndm/test_pndm.py b/tests/pipelines/pndm/test_pndm.py
index 5d9212223e6e..02f5b1fe4fb9 100644
--- a/tests/pipelines/pndm/test_pndm.py
+++ b/tests/pipelines/pndm/test_pndm.py
@@ -71,7 +71,7 @@ class PNDMPipelineIntegrationTests(unittest.TestCase):
     def test_inference_cifar10(self):
         model_id = "google/ddpm-cifar10-32"
 
-        unet = UNet2DModel.from_pretrained(model_id)
+        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
         scheduler = PNDMScheduler()
 
         pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
diff --git a/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/tests/pipelines/score_sde_ve/test_score_sde_ve.py
index 55dcc1cea1e0..e2f7bc22eca2 100644
--- a/tests/pipelines/score_sde_ve/test_score_sde_ve.py
+++ b/tests/pipelines/score_sde_ve/test_score_sde_ve.py
@@ -72,7 +72,7 @@ def test_inference(self):
 class ScoreSdeVePipelineIntegrationTests(unittest.TestCase):
     def test_inference(self):
         model_id = "google/ncsnpp-church-256"
-        model = UNet2DModel.from_pretrained(model_id)
+        model = UNet2DModel.from_pretrained(model_id, device_map="auto")
 
         scheduler = ScoreSdeVeScheduler.from_config(model_id)
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index a81710987814..464d5ef33bfd 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -528,7 +528,7 @@ def tearDown(self):
 
     def test_stable_diffusion(self):
         # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1", device_map="auto")
         sd_pipe = sd_pipe.to(torch_device)
         sd_pipe.set_progress_bar_config(disable=None)
 
@@ -548,7 +548,7 @@ def test_stable_diffusion(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_stable_diffusion_fast_ddim(self):
-        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1", device_map="auto")
         sd_pipe = sd_pipe.to(torch_device)
         sd_pipe.set_progress_bar_config(disable=None)
 
@@ -576,7 +576,7 @@ def test_stable_diffusion_fast_ddim(self):
 
     def test_lms_stable_diffusion_pipeline(self):
         model_id = "CompVis/stable-diffusion-v1-1"
-        pipe = StableDiffusionPipeline.from_pretrained(model_id).to(torch_device)
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, device_map="auto").to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         scheduler = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
         pipe.scheduler = scheduler
@@ -595,9 +595,10 @@ def test_lms_stable_diffusion_pipeline(self):
     def test_stable_diffusion_memory_chunking(self):
         torch.cuda.reset_peak_memory_stats()
         model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16).to(
-            torch_device
+        pipe = StableDiffusionPipeline.from_pretrained(
+            model_id, revision="fp16", torch_dtype=torch.float16, device_map="auto"
         )
+        pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         prompt = "a photograph of an astronaut riding a horse"
@@ -633,9 +634,10 @@ def test_stable_diffusion_memory_chunking(self):
     def test_stable_diffusion_text2img_pipeline_fp16(self):
         torch.cuda.reset_peak_memory_stats()
         model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16).to(
-            torch_device
+        pipe = StableDiffusionPipeline.from_pretrained(
+            model_id, revision="fp16", device_map="auto", torch_dtype=torch.float16
         )
+        pipe = pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         prompt = "a photograph of an astronaut riding a horse"
@@ -670,6 +672,7 @@ def test_stable_diffusion_text2img_pipeline(self):
         pipe = StableDiffusionPipeline.from_pretrained(
             model_id,
             safety_checker=None,
+            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -711,7 +714,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
         test_callback_fn.has_been_called = False
 
         pipe = StableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
+            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, device_map="auto"
         )
         pipe = pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -737,7 +740,7 @@ def test_stable_diffusion_accelerate_auto_device(self):
 
         start_time = time.time()
         pipeline_normal_load = StableDiffusionPipeline.from_pretrained(
-            pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True
+            pipeline_id, revision="fp16", torch_dtype=torch.float16, device_map="auto"
         )
         pipeline_normal_load.to(torch_device)
         normal_load_time = time.time() - start_time
@@ -758,7 +761,9 @@ def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):
         pipeline_id = "CompVis/stable-diffusion-v1-4"
         prompt = "Andromeda galaxy in a bottle"
 
-        pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, revision="fp16", torch_dtype=torch.float16)
+        pipeline = StableDiffusionPipeline.from_pretrained(
+            pipeline_id, revision="fp16", torch_dtype=torch.float16, device_map="auto"
+        )
         pipeline.enable_attention_slicing(1)
         pipeline.enable_sequential_cpu_offload()
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 409bb225567a..7cbb9bc51e81 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -488,6 +488,7 @@ def test_stable_diffusion_img2img_pipeline(self):
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
             model_id,
             safety_checker=None,
+            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -529,6 +530,7 @@ def test_stable_diffusion_img2img_pipeline_k_lms(self):
             model_id,
             scheduler=lms,
             safety_checker=None,
+            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -580,7 +582,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
         init_image = init_image.resize((768, 512))
 
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
+            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, device_map="auto"
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 14b4990784e4..b577436f4e5d 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -288,6 +288,7 @@ def test_stable_diffusion_inpaint_pipeline(self):
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             model_id,
             safety_checker=None,
+            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -329,6 +330,7 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self):
             revision="fp16",
             torch_dtype=torch.float16,
             safety_checker=None,
+            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -366,7 +368,9 @@ def test_stable_diffusion_inpaint_pipeline_pndm(self):
 
         pndm = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True)
         model_id = "runwayml/stable-diffusion-inpainting"
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None, scheduler=pndm)
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id, safety_checker=None, scheduler=pndm, device_map="auto"
+        )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing()
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index 697da4c921df..90dbf3b63604 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -368,6 +368,7 @@ def test_stable_diffusion_inpaint_legacy_pipeline(self):
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             model_id,
             safety_checker=None,
+            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -413,6 +414,7 @@ def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
             model_id,
             scheduler=lms,
             safety_checker=None,
+            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -469,7 +471,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
         )
 
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
+            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, device_map="auto"
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 3cc94962c38f..5ce07dee8d57 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -108,8 +108,8 @@ def test_local_custom_pipeline(self):
     def test_load_pipeline_from_git(self):
         clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
 
-        feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id)
-        clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16)
+        feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id, device_map="auto")
+        clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16, device_map="auto")
 
         pipeline = DiffusionPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4",
@@ -118,6 +118,7 @@ def test_load_pipeline_from_git(self):
             feature_extractor=feature_extractor,
             torch_dtype=torch.float16,
             revision="fp16",
+            device_map="auto",
         )
         pipeline.enable_attention_slicing()
         pipeline = pipeline.to(torch_device)
@@ -312,7 +313,9 @@ def tearDown(self):
     def test_smart_download(self):
         model_id = "hf-internal-testing/unet-pipeline-dummy"
         with tempfile.TemporaryDirectory() as tmpdirname:
-            _ = DiffusionPipeline.from_pretrained(model_id, cache_dir=tmpdirname, force_download=True)
+            _ = DiffusionPipeline.from_pretrained(
+                model_id, cache_dir=tmpdirname, force_download=True, device_map="auto"
+            )
             local_repo_name = "--".join(["models"] + model_id.split("/"))
             snapshot_dir = os.path.join(tmpdirname, local_repo_name, "snapshots")
             snapshot_dir = os.path.join(snapshot_dir, os.listdir(snapshot_dir)[0])
@@ -335,7 +338,9 @@ def test_warning_unused_kwargs(self):
         logger = logging.get_logger("diffusers.pipeline_utils")
         with tempfile.TemporaryDirectory() as tmpdirname:
             with CaptureLogger(logger) as cap_logger:
-                DiffusionPipeline.from_pretrained(model_id, not_used=True, cache_dir=tmpdirname, force_download=True)
+                DiffusionPipeline.from_pretrained(
+                    model_id, not_used=True, cache_dir=tmpdirname, force_download=True, device_map="auto"
+                )
 
         assert cap_logger.out == "Keyword arguments {'not_used': True} not recognized.\n"
 
@@ -358,7 +363,7 @@ def test_from_pretrained_save_pretrained(self):
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             ddpm.save_pretrained(tmpdirname)
-            new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)
+            new_ddpm = DDPMPipeline.from_pretrained(tmpdirname, device_map="auto")
             new_ddpm.to(torch_device)
 
         generator = torch.manual_seed(0)
@@ -374,10 +379,10 @@ def test_from_pretrained_hub(self):
 
         scheduler = DDPMScheduler(num_train_timesteps=10)
 
-        ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler)
+        ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler, device_map="auto")
         ddpm.to(torch_device)
         ddpm.set_progress_bar_config(disable=None)
-        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler)
+        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, device_map="auto")
         ddpm_from_hub.to(torch_device)
         ddpm_from_hub.set_progress_bar_config(disable=None)
 
@@ -395,12 +400,14 @@ def test_from_pretrained_hub_pass_model(self):
         scheduler = DDPMScheduler(num_train_timesteps=10)
 
         # pass unet into DiffusionPipeline
-        unet = UNet2DModel.from_pretrained(model_path)
-        ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(model_path, unet=unet, scheduler=scheduler)
+        unet = UNet2DModel.from_pretrained(model_path, device_map="auto")
+        ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(
+            model_path, unet=unet, scheduler=scheduler, device_map="auto"
+        )
         ddpm_from_hub_custom_model.to(torch_device)
         ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
 
-        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler)
+        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, device_map="auto")
         ddpm_from_hub.to(torch_device)
         ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
 
@@ -415,7 +422,7 @@ def test_from_pretrained_hub_pass_model(self):
     def test_output_format(self):
         model_path = "google/ddpm-cifar10-32"
 
-        pipe = DDIMPipeline.from_pretrained(model_path)
+        pipe = DDIMPipeline.from_pretrained(model_path, device_map="auto")
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -437,7 +444,7 @@ def test_output_format(self):
     def test_ddpm_ddim_equality(self):
         model_id = "google/ddpm-cifar10-32"
 
-        unet = UNet2DModel.from_pretrained(model_id)
+        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
         ddpm_scheduler = DDPMScheduler()
         ddim_scheduler = DDIMScheduler()
 
@@ -461,7 +468,7 @@ def test_ddpm_ddim_equality(self):
     def test_ddpm_ddim_equality_batched(self):
         model_id = "google/ddpm-cifar10-32"
 
-        unet = UNet2DModel.from_pretrained(model_id)
+        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
         ddpm_scheduler = DDPMScheduler()
         ddim_scheduler = DDIMScheduler()
 

From 8d6487f3cbe89bb6e32f82fc9f04df6ce001ef24 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 28 Oct 2022 17:05:00 +0200
Subject: [PATCH 075/269] Fix some failing tests (#1041)

* up

* up

* up

* Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py

* Apply suggestions from code review
---
 .../latent_diffusion/pipeline_latent_diffusion.py  |  2 ++
 .../stable_diffusion/pipeline_stable_diffusion.py  |  1 -
 .../stable_diffusion/test_stable_diffusion.py      |  6 ++----
 tests/test_pipelines.py                            | 14 +++++++++-----
 4 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
index 62a5785beb2f..d8948862845d 100644
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -662,6 +662,8 @@ def custom_forward(*inputs):
 
 
 class LDMBertModel(LDMBertPreTrainedModel):
+    _no_split_modules = []
+
     def __init__(self, config: LDMBertConfig):
         super().__init__(config)
         self.model = LDMBertEncoder(config)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index e80cc1360b33..2a33074a2473 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -208,7 +208,6 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-
         if isinstance(prompt, str):
             batch_size = 1
         elif isinstance(prompt, list):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 464d5ef33bfd..6b23aca33bc6 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -740,7 +740,7 @@ def test_stable_diffusion_accelerate_auto_device(self):
 
         start_time = time.time()
         pipeline_normal_load = StableDiffusionPipeline.from_pretrained(
-            pipeline_id, revision="fp16", torch_dtype=torch.float16, device_map="auto"
+            pipeline_id, revision="fp16", torch_dtype=torch.float16
         )
         pipeline_normal_load.to(torch_device)
         normal_load_time = time.time() - start_time
@@ -761,9 +761,7 @@ def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):
         pipeline_id = "CompVis/stable-diffusion-v1-4"
         prompt = "Andromeda galaxy in a bottle"
 
-        pipeline = StableDiffusionPipeline.from_pretrained(
-            pipeline_id, revision="fp16", torch_dtype=torch.float16, device_map="auto"
-        )
+        pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, revision="fp16", torch_dtype=torch.float16)
         pipeline.enable_attention_slicing(1)
         pipeline.enable_sequential_cpu_offload()
 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 5ce07dee8d57..e355a19493ed 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -77,6 +77,7 @@ def test_load_custom_pipeline(self):
         pipeline = DiffusionPipeline.from_pretrained(
             "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline"
         )
+        pipeline = pipeline.to(torch_device)
         # NOTE that `"CustomPipeline"` is not a class that is defined in this library, but solely on the Hub
         # under https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py#L24
         assert pipeline.__class__.__name__ == "CustomPipeline"
@@ -85,6 +86,7 @@ def test_run_custom_pipeline(self):
         pipeline = DiffusionPipeline.from_pretrained(
             "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline"
         )
+        pipeline = pipeline.to(torch_device)
         images, output_str = pipeline(num_inference_steps=2, output_type="np")
 
         assert images[0].shape == (1, 32, 32, 3)
@@ -96,6 +98,7 @@ def test_local_custom_pipeline(self):
         pipeline = DiffusionPipeline.from_pretrained(
             "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path
         )
+        pipeline = pipeline.to(torch_device)
         images, output_str = pipeline(num_inference_steps=2, output_type="np")
 
         assert pipeline.__class__.__name__ == "CustomLocalPipeline"
@@ -109,7 +112,7 @@ def test_load_pipeline_from_git(self):
         clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
 
         feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id, device_map="auto")
-        clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16, device_map="auto")
+        clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16)
 
         pipeline = DiffusionPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4",
@@ -380,10 +383,11 @@ def test_from_pretrained_hub(self):
         scheduler = DDPMScheduler(num_train_timesteps=10)
 
         ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler, device_map="auto")
-        ddpm.to(torch_device)
+        ddpm = ddpm.to(torch_device)
         ddpm.set_progress_bar_config(disable=None)
+
         ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, device_map="auto")
-        ddpm_from_hub.to(torch_device)
+        ddpm_from_hub = ddpm_from_hub.to(torch_device)
         ddpm_from_hub.set_progress_bar_config(disable=None)
 
         generator = torch.manual_seed(0)
@@ -404,11 +408,11 @@ def test_from_pretrained_hub_pass_model(self):
         ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(
             model_path, unet=unet, scheduler=scheduler, device_map="auto"
         )
-        ddpm_from_hub_custom_model.to(torch_device)
+        ddpm_from_hub_custom_model = ddpm_from_hub_custom_model.to(torch_device)
         ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
 
         ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, device_map="auto")
-        ddpm_from_hub.to(torch_device)
+        ddpm_from_hub = ddpm_from_hub.to(torch_device)
         ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
 
         generator = torch.manual_seed(0)

From c4ef1efe4649b87cb4fc6ff3547556c4aaa6fb64 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 28 Oct 2022 17:38:31 +0200
Subject: [PATCH 076/269] [Tests] Better prints (#1043)

---
 src/diffusers/utils/__init__.py      | 10 +++++++++-
 src/diffusers/utils/testing_utils.py |  8 ++++++++
 tests/models/test_models_unet_2d.py  | 24 ++++++++++++------------
 tests/models/test_models_vae.py      | 16 ++++++++--------
 4 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index ae00c1cf6a10..44399b682648 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -40,7 +40,15 @@
 
 
 if is_torch_available():
-    from .testing_utils import floats_tensor, load_image, parse_flag_from_env, require_torch_gpu, slow, torch_device
+    from .testing_utils import (
+        floats_tensor,
+        load_image,
+        parse_flag_from_env,
+        require_torch_gpu,
+        slow,
+        torch_all_close,
+        torch_device,
+    )
 
 
 logger = get_logger(__name__)
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index 700faf209926..3ae28261b60b 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -34,6 +34,14 @@
         torch_device = "mps" if (mps_backend_registered and torch.backends.mps.is_available()) else torch_device
 
 
+def torch_all_close(a, b, *args, **kwargs):
+    if not is_torch_available():
+        raise ValueError("PyTorch needs to be installed to use this function.")
+    if not torch.allclose(a, b, *args, **kwargs):
+        assert False, f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}."
+    return True
+
+
 def get_tests_dir(append_path=None):
     """
     Args:
diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
index b2aa90b236d0..ac71deeaf0eb 100644
--- a/tests/models/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -21,7 +21,7 @@
 import torch
 
 from diffusers import UNet2DConditionModel, UNet2DModel
-from diffusers.utils import floats_tensor, require_torch_gpu, slow, torch_device
+from diffusers.utils import floats_tensor, require_torch_gpu, slow, torch_all_close, torch_device
 from parameterized import parameterized
 
 from ..test_modeling_common import ModelTesterMixin
@@ -156,7 +156,7 @@ def test_from_pretrained_accelerate_wont_change_results(self):
         model_normal_load.eval()
         arr_normal_load = model_normal_load(noise, time_step)["sample"]
 
-        assert torch.allclose(arr_accelerate, arr_normal_load, rtol=1e-3)
+        assert torch_all_close(arr_accelerate, arr_normal_load, rtol=1e-3)
 
     @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU")
     def test_memory_footprint_gets_reduced(self):
@@ -207,7 +207,7 @@ def test_output_pretrained(self):
         expected_output_slice = torch.tensor([-13.3258, -20.1100, -15.9873, -17.6617, -23.0596, -17.9419, -13.3675, -16.1889, -12.3800])
         # fmt: on
 
-        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-3))
+        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-3))
 
 
 class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase):
@@ -287,7 +287,7 @@ def test_gradient_checkpointing(self):
         named_params = dict(model.named_parameters())
         named_params_2 = dict(model_2.named_parameters())
         for name, param in named_params.items():
-            self.assertTrue(torch.allclose(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
+            self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
 
 
 class NCSNppModelTests(ModelTesterMixin, unittest.TestCase):
@@ -377,7 +377,7 @@ def test_output_pretrained_ve_mid(self):
         expected_output_slice = torch.tensor([-4836.2231, -6487.1387, -3816.7969, -7964.9253, -10966.2842, -20043.6016, 8137.0571, 2340.3499, 544.6114])
         # fmt: on
 
-        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2))
+        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
 
     def test_output_pretrained_ve_large(self):
         model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update")
@@ -402,7 +402,7 @@ def test_output_pretrained_ve_large(self):
         expected_output_slice = torch.tensor([-0.0325, -0.0900, -0.0869, -0.0332, -0.0725, -0.0270, -0.0101, 0.0227, 0.0256])
         # fmt: on
 
-        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2))
+        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
 
     def test_forward_with_norm_groups(self):
         # not required for this model
@@ -464,7 +464,7 @@ def test_compvis_sd_v1_4(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
 
     @parameterized.expand(
         [
@@ -490,7 +490,7 @@ def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
 
     @parameterized.expand(
         [
@@ -515,7 +515,7 @@ def test_compvis_sd_v1_5(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
 
     @parameterized.expand(
         [
@@ -541,7 +541,7 @@ def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
 
     @parameterized.expand(
         [
@@ -566,7 +566,7 @@ def test_compvis_sd_inpaint(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
 
     @parameterized.expand(
         [
@@ -592,4 +592,4 @@ def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
index 65fe81fd3379..c35d88fa4e41 100644
--- a/tests/models/test_models_vae.py
+++ b/tests/models/test_models_vae.py
@@ -20,7 +20,7 @@
 
 from diffusers import AutoencoderKL
 from diffusers.modeling_utils import ModelMixin
-from diffusers.utils import floats_tensor, require_torch_gpu, slow, torch_device
+from diffusers.utils import floats_tensor, require_torch_gpu, slow, torch_all_close, torch_device
 from parameterized import parameterized
 
 from ..test_modeling_common import ModelTesterMixin
@@ -131,7 +131,7 @@ def test_output_pretrained(self):
                 [-0.2421, 0.4642, 0.2507, -0.0438, 0.0682, 0.3160, -0.2018, -0.0727, 0.2485]
             )
 
-        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2))
+        self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2))
 
 
 @slow
@@ -185,7 +185,7 @@ def test_stable_diffusion(self, seed, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
 
     @parameterized.expand(
         [
@@ -209,7 +209,7 @@ def test_stable_diffusion_fp16(self, seed, expected_slice):
         output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
 
     @parameterized.expand(
         [
@@ -231,7 +231,7 @@ def test_stable_diffusion_mode(self, seed, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
 
     @parameterized.expand(
         [
@@ -254,7 +254,7 @@ def test_stable_diffusion_decode(self, seed, expected_slice):
         output_slice = sample[-1, -2:, :2, -2:].flatten().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
 
     @parameterized.expand(
         [
@@ -276,7 +276,7 @@ def test_stable_diffusion_decode_fp16(self, seed, expected_slice):
         output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
 
     @parameterized.expand(
         [
@@ -300,4 +300,4 @@ def test_stable_diffusion_encode_sample(self, seed, expected_slice):
         output_slice = sample[0, -1, -3:, -3:].flatten().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch.allclose(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)

From d37f08da723cba38965b2df8b21a8136af9f358f Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 28 Oct 2022 18:52:25 +0200
Subject: [PATCH 077/269] [Tests] no random latents anymore (#1045)

---
 src/diffusers/utils/__init__.py      |  1 +
 src/diffusers/utils/testing_utils.py | 18 +++++++++++++++++-
 tests/models/test_models_unet_2d.py  | 14 +++++++-------
 tests/models/test_models_vae.py      | 12 ++++++------
 4 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index 44399b682648..12d731128385 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -43,6 +43,7 @@
     from .testing_utils import (
         floats_tensor,
         load_image,
+        load_numpy,
         parse_flag_from_env,
         require_torch_gpu,
         slow,
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index 3ae28261b60b..bd3b08d54a1c 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -4,11 +4,14 @@
 import random
 import re
 import unittest
+import urllib.parse
 from distutils.util import strtobool
-from io import StringIO
+from io import BytesIO, StringIO
 from pathlib import Path
 from typing import Union
 
+import numpy as np
+
 import PIL.Image
 import PIL.ImageOps
 import requests
@@ -165,6 +168,19 @@ def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
     return image
 
 
+def load_numpy(path) -> np.ndarray:
+    if not path.startswith("http://") or path.startswith("https://"):
+        path = os.path.join(
+            "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main", urllib.parse.quote(path)
+        )
+
+    response = requests.get(path)
+    response.raise_for_status()
+    array = np.load(BytesIO(response.content))
+
+    return array
+
+
 # --- pytest conf functions --- #
 
 # to avoid multiple invocation from tests/conftest.py and examples/conftest.py - make sure it's called only once
diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
index ac71deeaf0eb..03d14f4d3e6e 100644
--- a/tests/models/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -21,7 +21,7 @@
 import torch
 
 from diffusers import UNet2DConditionModel, UNet2DModel
-from diffusers.utils import floats_tensor, require_torch_gpu, slow, torch_all_close, torch_device
+from diffusers.utils import floats_tensor, load_numpy, require_torch_gpu, slow, torch_all_close, torch_device
 from parameterized import parameterized
 
 from ..test_modeling_common import ModelTesterMixin
@@ -411,6 +411,9 @@ def test_forward_with_norm_groups(self):
 
 @slow
 class UNet2DConditionModelIntegrationTests(unittest.TestCase):
+    def get_file_format(self, seed, shape):
+        return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
+
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
@@ -418,11 +421,8 @@ def tearDown(self):
         torch.cuda.empty_cache()
 
     def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False):
-        batch_size, channels, height, width = shape
-        generator = torch.Generator(device=torch_device).manual_seed(seed)
         dtype = torch.float16 if fp16 else torch.float32
-        image = torch.randn(batch_size, channels, height, width, device=torch_device, generator=generator, dtype=dtype)
-
+        image = torch.from_numpy(load_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
         return image
 
     def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
@@ -437,9 +437,9 @@ def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
         return model
 
     def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False):
-        generator = torch.Generator(device=torch_device).manual_seed(seed)
         dtype = torch.float16 if fp16 else torch.float32
-        return torch.randn(shape, device=torch_device, generator=generator, dtype=dtype)
+        hidden_states = torch.from_numpy(load_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
+        return hidden_states
 
     @parameterized.expand(
         [
diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
index c35d88fa4e41..fd5461f66c03 100644
--- a/tests/models/test_models_vae.py
+++ b/tests/models/test_models_vae.py
@@ -20,7 +20,7 @@
 
 from diffusers import AutoencoderKL
 from diffusers.modeling_utils import ModelMixin
-from diffusers.utils import floats_tensor, require_torch_gpu, slow, torch_all_close, torch_device
+from diffusers.utils import floats_tensor, load_numpy, require_torch_gpu, slow, torch_all_close, torch_device
 from parameterized import parameterized
 
 from ..test_modeling_common import ModelTesterMixin
@@ -136,18 +136,18 @@ def test_output_pretrained(self):
 
 @slow
 class AutoencoderKLIntegrationTests(unittest.TestCase):
+    def get_file_format(self, seed, shape):
+        return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
+
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
         gc.collect()
         torch.cuda.empty_cache()
 
-    def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
-        batch_size, channels, height, width = shape
-        generator = torch.Generator(device=torch_device).manual_seed(seed)
+    def get_sd_image(self, seed=0, shape=(4, 4, 64, 64), fp16=False):
         dtype = torch.float16 if fp16 else torch.float32
-        image = torch.randn(batch_size, channels, height, width, device=torch_device, generator=generator, dtype=dtype)
-
+        image = torch.from_numpy(load_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
         return image
 
     def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False):

From cbbb29398a0f94320b78a789b08fecddd1ada97f Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 28 Oct 2022 16:55:21 +0000
Subject: [PATCH 078/269] hot fix

---
 tests/models/test_models_vae.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
index c35d88fa4e41..3d0b5f115584 100644
--- a/tests/models/test_models_vae.py
+++ b/tests/models/test_models_vae.py
@@ -20,7 +20,7 @@
 
 from diffusers import AutoencoderKL
 from diffusers.modeling_utils import ModelMixin
-from diffusers.utils import floats_tensor, require_torch_gpu, slow, torch_all_close, torch_device
+from diffusers.utils import floats_tensor, load_numpy, require_torch_gpu, slow, torch_all_close, torch_device
 from parameterized import parameterized
 
 from ..test_modeling_common import ModelTesterMixin
@@ -136,6 +136,9 @@ def test_output_pretrained(self):
 
 @slow
 class AutoencoderKLIntegrationTests(unittest.TestCase):
+    def get_file_format(self, seed, shape):
+        return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy"
+
     def tearDown(self):
         # clean up the VRAM after each test
         super().tearDown()
@@ -143,11 +146,8 @@ def tearDown(self):
         torch.cuda.empty_cache()
 
     def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
-        batch_size, channels, height, width = shape
-        generator = torch.Generator(device=torch_device).manual_seed(seed)
         dtype = torch.float16 if fp16 else torch.float32
-        image = torch.randn(batch_size, channels, height, width, device=torch_device, generator=generator, dtype=dtype)
-
+        image = torch.from_numpy(load_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
         return image
 
     def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False):

From a7ae808ee25464b75b192d7a50649e217dbd921f Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 28 Oct 2022 17:50:22 +0000
Subject: [PATCH 079/269] increase tolerance

---
 tests/models/test_models_unet_2d.py | 15 ++++++++-------
 tests/models/test_models_vae.py     | 12 ++++++------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
index 03d14f4d3e6e..548588918c88 100644
--- a/tests/models/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -21,12 +21,13 @@
 import torch
 
 from diffusers import UNet2DConditionModel, UNet2DModel
-from diffusers.utils import floats_tensor, load_numpy, require_torch_gpu, slow, torch_all_close, torch_device
+from diffusers.utils import floats_tensor, load_numpy, logging, require_torch_gpu, slow, torch_all_close, torch_device
 from parameterized import parameterized
 
 from ..test_modeling_common import ModelTesterMixin
 
 
+logger = logging.get_logger(__name__)
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
@@ -464,7 +465,7 @@ def test_compvis_sd_v1_4(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
 
     @parameterized.expand(
         [
@@ -490,7 +491,7 @@ def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
 
     @parameterized.expand(
         [
@@ -515,7 +516,7 @@ def test_compvis_sd_v1_5(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
 
     @parameterized.expand(
         [
@@ -541,7 +542,7 @@ def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
 
     @parameterized.expand(
         [
@@ -566,7 +567,7 @@ def test_compvis_sd_inpaint(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
 
     @parameterized.expand(
         [
@@ -592,4 +593,4 @@ def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
index 3d0b5f115584..95e5f656af17 100644
--- a/tests/models/test_models_vae.py
+++ b/tests/models/test_models_vae.py
@@ -185,7 +185,7 @@ def test_stable_diffusion(self, seed, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
 
     @parameterized.expand(
         [
@@ -209,7 +209,7 @@ def test_stable_diffusion_fp16(self, seed, expected_slice):
         output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
 
     @parameterized.expand(
         [
@@ -231,7 +231,7 @@ def test_stable_diffusion_mode(self, seed, expected_slice):
         output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
 
     @parameterized.expand(
         [
@@ -254,7 +254,7 @@ def test_stable_diffusion_decode(self, seed, expected_slice):
         output_slice = sample[-1, -2:, :2, -2:].flatten().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)
 
     @parameterized.expand(
         [
@@ -276,7 +276,7 @@ def test_stable_diffusion_decode_fp16(self, seed, expected_slice):
         output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
 
     @parameterized.expand(
         [
@@ -300,4 +300,4 @@ def test_stable_diffusion_encode_sample(self, seed, expected_slice):
         output_slice = sample[0, -1, -3:, -3:].flatten().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-4)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-3)

From 81b6fbf19d115de02b53dc34afe23b4833fa45da Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 28 Oct 2022 18:19:06 +0000
Subject: [PATCH 080/269] higher precision for vae

---
 tests/models/test_models_vae.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
index 95e5f656af17..f6333d6cd906 100644
--- a/tests/models/test_models_vae.py
+++ b/tests/models/test_models_vae.py
@@ -209,7 +209,7 @@ def test_stable_diffusion_fp16(self, seed, expected_slice):
         output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu()
         expected_output_slice = torch.tensor(expected_slice)
 
-        assert torch_all_close(output_slice, expected_output_slice, atol=5e-3)
+        assert torch_all_close(output_slice, expected_output_slice, atol=1e-2)
 
     @parameterized.expand(
         [

From 6b185b6acdc6eb4b658ec91fc60a47113b8dcca1 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Fri, 28 Oct 2022 21:02:08 +0200
Subject: [PATCH 081/269] Update training and fine-tuning docs (#1020)

* Update training and fine-tuning docs.

* Update examples README.

* Update README.

* Add Flax fine-tuning section.

* Accept suggestion

Co-authored-by: Anton Lozhkov <anton@huggingface.co>

* Accept suggestion

Co-authored-by: Anton Lozhkov <anton@huggingface.co>

Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 README.md                               |  25 ++-
 docs/source/_toctree.yml                |   6 +-
 docs/source/training/dreambooth.mdx     | 240 ++++++++++++++++++++++++
 docs/source/training/overview.mdx       |   8 +-
 docs/source/training/text2image.mdx     | 126 ++++++++++++-
 docs/source/training/text_inversion.mdx |   2 +-
 examples/README.md                      |   8 +-
 examples/text_to_image/README.md        |   2 +-
 8 files changed, 402 insertions(+), 15 deletions(-)
 create mode 100644 docs/source/training/dreambooth.mdx

diff --git a/README.md b/README.md
index ac518fb8ec76..2302137c1e72 100644
--- a/README.md
+++ b/README.md
@@ -182,9 +182,9 @@ image.save("astronaut_rides_horse.png")
 
 ### JAX/Flax
 
-To use StableDiffusion on TPUs and GPUs for faster inference you can leverage JAX/Flax.
+Diffusers offers a JAX / Flax implementation of Stable Diffusion for very fast inference. JAX shines specially on TPU hardware because each TPU server has 8 accelerators working in parallel, but it runs great on GPUs too.
 
-Running the pipeline with default PNDMScheduler
+Running the pipeline with the default PNDMScheduler:
 
 ```python
 import jax
@@ -331,8 +331,25 @@ You can generate your own latents to reproduce results, or tweak your prompt on
 
 For more details, check out [the Stable Diffusion notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb)
 and have a look into the [release notes](https://github.com/huggingface/diffusers/releases/tag/v0.2.0).
-  
-## Examples
+
+## Fine-Tuning Stable Diffusion
+
+Fine-tuning techniques make it possible to adapt Stable Diffusion to your own dataset, or add new subjects to it. These are some of the techniques supported in `diffusers`:
+
+Textual Inversion is a technique for capturing novel concepts from a small number of example images in a way that can later be used to control text-to-image pipelines. It does so by learning new 'words' in the embedding space of the pipeline's text encoder. These special words can then be used within text prompts to achieve very fine-grained control of the resulting images. 
+
+- Textual Inversion. Capture novel concepts from a small set of sample images, and associate them with new "words" in the embedding space of the text encoder. Please, refer to [our training examples](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) or [documentation](https://huggingface.co/docs/diffusers/training/text_inversion) to try for yourself.
+
+- Dreambooth. Another technique to capture new concepts in Stable Diffusion. This method fine-tunes the UNet (and, optionally, also the text encoder) of the pipeline to achieve impressive results. Please, refer to [our training examples](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) and [training report](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) for additional details and training recommendations.
+
+- Full Stable Diffusion fine-tuning. If you have a more sizable dataset with a specific look or style, you can fine-tune Stable Diffusion so that it outputs images following those examples. This was the approach taken to create [a Pokémon Stable Diffusion model](https://huggingface.co/justinpinkney/pokemon-stable-diffusion) (by Justing Pinkney / Lambda Labs), [a Japanese specific version of Stable Diffusion](https://huggingface.co/spaces/rinna/japanese-stable-diffusion) (by [Rinna Co.](https://github.com/rinnakk/japanese-stable-diffusion/) and others. You can start at [our text-to-image fine-tuning example](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) and go from there.
+
+
+## Stable Diffusion Community Pipelines
+
+The release of Stable Diffusion as an open source model has fostered a lot of interesting ideas and experimentation. Our [Community Examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community) contains many ideas worth exploring, like interpolating to create animated videos, using CLIP Guidance for additional prompt fidelity, term weighting, and much more! Take a look and [contribute your own](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipelines).
+
+## Other Examples
 
 There are many ways to try running Diffusers! Here we outline code-focused tools (primarily using `DiffusionPipeline`s and Google Colab) and interactive web-tools.
 
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 295e24674854..7e46d95a46f1 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -46,9 +46,11 @@
   - local: training/unconditional_training
     title: "Unconditional Image Generation"
   - local: training/text_inversion
-    title: "Text Inversion"
+    title: "Textual Inversion"
+  - local: training/dreambooth
+    title: "Dreambooth"
   - local: training/text2image
-    title: "Text-to-image"
+    title: "Text-to-image fine-tuning"
   title: "Training"
 - sections:
   - local: conceptual/stable_diffusion
diff --git a/docs/source/training/dreambooth.mdx b/docs/source/training/dreambooth.mdx
new file mode 100644
index 000000000000..cf7e5dbcec41
--- /dev/null
+++ b/docs/source/training/dreambooth.mdx
@@ -0,0 +1,240 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# DreamBooth fine-tuning example
+
+[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text-to-image models like stable diffusion given just a few (3~5) images of a subject.
+
+![Dreambooth examples from the project's blog](https://dreambooth.github.io/DreamBooth_files/teaser_static.jpg)
+_Dreambooth examples from the [project's blog](https://dreambooth.github.io)._
+
+The [Dreambooth training script](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) shows how to implement this training procedure on a pre-trained Stable Diffusion model.
+
+<Tip warning={true}>
+
+<!-- TODO: replace with our blog when it's done -->
+
+Dreambooth fine-tuning is very sensitive to hyperparameters and easy to overfit. We recommend you take a look at our [in-depth analysis](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) with recommended settings for different subjects, and go from there.
+
+</Tip>
+
+## Training locally 
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies. We also recommend to install `diffusers` from the `main` github branch.
+
+```bash
+pip install git+https://github.com/huggingface/diffusers
+pip install -U -r diffusers/examples/dreambooth/requirements.txt
+```
+
+Then initialize and configure a [🤗 Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
+
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+
+Run the following command to authenticate your token
+
+```bash
+huggingface-cli login
+```
+
+If you have already cloned the repo, then you won't need to go through these steps. Instead, you can pass the path to your local checkout to the training script and it will be loaded from there.
+
+### Dog toy example
+
+In this example we'll use [these images](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) to add a new concept to Stable Diffusion using the Dreambooth process. They will be our training data. Please, download them and place them somewhere in your system.
+
+Then you can launch the training script using:
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path_to_training_images"
+export OUTPUT_DIR="path_to_saved_model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --instance_prompt="a photo of sks dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --max_train_steps=400
+```
+
+### Training with a prior-preserving loss
+
+Prior preservation is used to avoid overfitting and language-drift. Please, refer to the paper to learn more about it if you are interested. For prior preservation, we use other images of the same class as part of the training process. The nice thing is that we can generate those images using the Stable Diffusion model itself! The training script will save the generated images to a local path we specify.
+
+According to the paper, it's recommended to generate `num_epochs * num_samples` images for prior preservation. 200-300 works well for most cases.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path_to_training_images"
+export CLASS_DIR="path_to_class_images"
+export OUTPUT_DIR="path_to_saved_model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Training on a 16GB GPU
+
+With the help of gradient checkpointing and the 8-bit optimizer from [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), it's possible to train dreambooth on a 16GB GPU.
+
+```bash
+pip install bitsandbytes
+```
+
+Then pass the `--use_8bit_adam` option to the training script.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path_to_training_images"
+export CLASS_DIR="path_to_class_images"
+export OUTPUT_DIR="path_to_saved_model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=2 --gradient_checkpointing \
+  --use_8bit_adam \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Fine-tune the text encoder in addition to the UNet
+
+The script also allows to fine-tune the `text_encoder` along with the `unet`. It has been observed experimentally that this gives much better results, especially on faces. Please, refer to [our report](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) for more details.
+
+To enable this option, pass the `--train_text_encoder` argument to the training script.
+
+<Tip>
+Training the text encoder requires additional memory, so training won't fit on a 16GB GPU. You'll need at least 24GB VRAM to use this option.
+</Tip>
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path_to_training_images"
+export CLASS_DIR="path_to_class_images"
+export OUTPUT_DIR="path_to_saved_model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME  \
+  --train_text_encoder \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --use_8bit_adam
+  --gradient_checkpointing \
+  --learning_rate=2e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800
+```
+
+### Training on a 8 GB GPU:
+
+Using [DeepSpeed](https://www.deepspeed.ai/) it's even possible to offload some
+tensors from VRAM to either CPU or NVME, allowing training to proceed with less GPU memory.
+
+DeepSpeed needs to be enabled with `accelerate config`. During configuration,
+answer yes to "Do you want to use DeepSpeed?". Combining DeepSpeed stage 2, fp16
+mixed precision, and offloading both the model parameters and the optimizer state to CPU, it's
+possible to train on under 8 GB VRAM. The drawback is that this requires more system RAM (about 25 GB). See [the DeepSpeed documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more configuration options.
+
+Changing the default Adam optimizer to DeepSpeed's special version of Adam
+`deepspeed.ops.adam.DeepSpeedCPUAdam` gives a substantial speedup, but enabling
+it requires the system's CUDA toolchain version to be the same as the one installed with PyTorch. 8-bit optimizers don't seem to be compatible with DeepSpeed at the moment.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export INSTANCE_DIR="path_to_training_images"
+export CLASS_DIR="path_to_class_images"
+export OUTPUT_DIR="path_to_saved_model"
+
+accelerate launch train_dreambooth.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="a photo of sks dog" \
+  --class_prompt="a photo of dog" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --sample_batch_size=1 \
+  --gradient_accumulation_steps=1 --gradient_checkpointing \
+  --learning_rate=5e-6 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=800 \
+  --mixed_precision=fp16
+```
+
+## Inference
+
+Once you have trained a model, inference can be done using the `StableDiffusionPipeline`, by simply indicating the path where the model was saved. Make sure that your prompts include the special `identifier` used during training (`sks` in the previous examples).
+
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+model_id = "path_to_saved_model"
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+
+prompt = "A photo of sks dog in a bucket"
+image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+
+image.save("dog-bucket.png")
+```
diff --git a/docs/source/training/overview.mdx b/docs/source/training/overview.mdx
index c403298493e5..9b36117ca43a 100644
--- a/docs/source/training/overview.mdx
+++ b/docs/source/training/overview.mdx
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
 
 # 🧨 Diffusers Training Examples
 
-Diffusers examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
+Diffusers training examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
 for a variety of use cases.
 
 **Note**: If you are looking for **official** examples on how to use `diffusers` for inference, 
@@ -36,13 +36,15 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
 - [Unconditional Training](./unconditional_training)
 - [Text-to-Image Training](./text2image)
 - [Text Inversion](./text_inversion)
+- [Dreambooth](./dreambooth)
 
 
 | Task | 🤗 Accelerate | 🤗 Datasets | Colab
 |---|---|:---:|:---:|
 | [**Unconditional Image Generation**](./unconditional_training) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [**Text-to-Image**](./text2image) | - | - | 
-| [**Text-Inversion**](./text_inversion) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+| [**Text-to-Image fine-tuning**](./text2image) | ✅ | ✅ | 
+| [**Textual Inversion**](./text_inversion) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+| [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
 
 ## Community
 
diff --git a/docs/source/training/text2image.mdx b/docs/source/training/text2image.mdx
index dcbdece429a9..1b04462f77f1 100644
--- a/docs/source/training/text2image.mdx
+++ b/docs/source/training/text2image.mdx
@@ -11,6 +11,128 @@ specific language governing permissions and limitations under the License.
 -->
 
 
-# Text-to-Image Training
+# Stable Diffusion text-to-image fine-tuning
 
-Under construction 🚧
+The [`train_text_to_image.py`](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) script shows how to fine-tune the stable diffusion model on your own dataset.
+
+<Tip warning={true}>
+
+The text-to-image fine-tuning script is experimental. It's easy to overfit and run into issues like catastrophic forgetting. We recommend to explore different hyperparameters to get the best results on your dataset.
+
+</Tip>
+
+
+## Running locally 
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+```bash
+pip install git+https://github.com/huggingface/diffusers.git
+pip install -U -r requirements.txt
+```
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
+
+You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
+
+Run the following command to authenticate your token
+
+```bash
+huggingface-cli login
+```
+
+If you have already cloned the repo, then you won't need to go through these steps. Instead, you can pass the path to your local checkout to the training script and it will be loaded from there.
+
+### Hardware Requirements for Fine-tuning
+
+Using `gradient_checkpointing` and `mixed_precision` it should be possible to fine tune the model on a single 24GB GPU. For higher `batch_size` and faster training it's better to use GPUs with more than 30GB of GPU memory. You can also use JAX / Flax for fine-tuning on TPUs or GPUs, see [below](#flax-jax-finetuning) for details.
+
+### Fine-tuning Example
+
+The following script will launch a fine-tuning run using [Justin Pinkneys' captioned Pokemon dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions), available in Hugging Face Hub.
+
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export dataset_name="lambdalabs/pokemon-blip-captions"
+
+accelerate launch train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$dataset_name \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir="sd-pokemon-model" 
+```
+
+To run on your own training files you need to prepare the dataset according to the format required by `datasets`. You can upload your dataset to the Hub, or you can prepare a local folder with your files. [This documentation](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata) explains how to do it.
+
+You should modify the script if you wish to use custom loading logic. We have left pointers in the code in the appropriate places :)
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export TRAIN_DIR="path_to_your_dataset"
+export OUTPUT_DIR="path_to_save_model"
+
+accelerate launch train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir=${OUTPUT_DIR}
+```
+
+Once training is finished the model will be saved to the `OUTPUT_DIR` specified in the command. To load the fine-tuned model for inference, just pass that path to `StableDiffusionPipeline`:
+
+```python
+from diffusers import StableDiffusionPipeline
+
+model_path = "path_to_saved_model"
+pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(prompt="yoda").images[0]
+image.save("yoda-pokemon.png")
+```
+
+### Flax / JAX fine-tuning
+
+Thanks to [@duongna211](https://github.com/duongna21) it's possible to fine-tune Stable Diffusion using Flax! This is very efficient on TPU hardware but works great on GPUs too. You can use the [Flax training script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_flax.py) like this:
+
+```Python
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
+export dataset_name="lambdalabs/pokemon-blip-captions"
+
+python train_text_to_image_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$dataset_name \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --output_dir="sd-pokemon-model" 
+```
diff --git a/docs/source/training/text_inversion.mdx b/docs/source/training/text_inversion.mdx
index d44cef301ed7..7bc145299eac 100644
--- a/docs/source/training/text_inversion.mdx
+++ b/docs/source/training/text_inversion.mdx
@@ -49,7 +49,7 @@ The `textual_inversion.py` script [here](https://github.com/huggingface/diffuser
 
 ### Installing the dependencies
 
-Before running the scripts, make sure to install the library's training dependencies:
+Before running the scripts, make sure to install the library's training dependencies.
 
 ```bash
 pip install diffusers[training] accelerate transformers
diff --git a/examples/README.md b/examples/README.md
index 2680b638d585..56b1b81b9060 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -16,7 +16,7 @@ limitations under the License.
 # 🧨 Diffusers Examples
 
 Diffusers examples are a collection of scripts to demonstrate how to effectively use the `diffusers` library
-for a variety of use cases.
+for a variety of use cases involving training or fine-tuning.
 
 **Note**: If you are looking for **official** examples on how to use `diffusers` for inference, 
 please have a look at [src/diffusers/pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines)
@@ -38,7 +38,11 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
 
 | Task | 🤗 Accelerate | 🤗 Datasets | Colab
 |---|---|:---:|:---:|
-| [**Unconditional Image Generation**](https://github.com/huggingface/diffusers/blob/main/examples/unconditional_image_generation/train_unconditional.py) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [**Unconditional Image Generation**](./unconditional_training) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [**Text-to-Image fine-tuning**](./text2image) | ✅ | ✅ | 
+| [**Textual Inversion**](./text_inversion) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+| [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
+
 
 ## Community
 
diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md
index f7c8cdcb4ccb..170ed384f13c 100644
--- a/examples/text_to_image/README.md
+++ b/examples/text_to_image/README.md
@@ -4,7 +4,7 @@ The `train_text_to_image.py` script shows how to fine-tune stable diffusion mode
 
 ___Note___:
 
-___This script is experimental. The script fine-tunes the whole model and often times the model overifits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparamters to get the best result on your dataset.___
+___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparamters to get the best result on your dataset.___
 
 
 ## Running locally with PyTorch

From fc0ca47456ac81e12224014958c7620938c9c6f6 Mon Sep 17 00:00:00 2001
From: Minwoo Byeon <mwbyun@hotmail.com>
Date: Sat, 29 Oct 2022 16:26:23 +0900
Subject: [PATCH 082/269] Fix speedup ratio in fp16.mdx (#837)

---
 docs/source/optimization/fp16.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/optimization/fp16.mdx b/docs/source/optimization/fp16.mdx
index b4ffed31c76b..f12c067ba5ee 100644
--- a/docs/source/optimization/fp16.mdx
+++ b/docs/source/optimization/fp16.mdx
@@ -18,9 +18,9 @@ We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for
 | ---------------- | ------- | ------- |
 | original         | 9.50s   | x1      |
 | cuDNN auto-tuner | 9.37s   | x1.01   |
-| autocast (fp16)  | 5.47s   | x1.91   |
-| fp16             | 3.61s   | x2.91   |
-| channels last    | 3.30s   | x2.87   |
+| autocast (fp16)  | 5.47s   | x1.74   |
+| fp16             | 3.61s   | x2.63   |
+| channels last    | 3.30s   | x2.88   |
 | traced UNet      | 3.21s   | x2.96   |
 
 <em>

From 12fd0736dcc51f77c52130ab10177d0c1d5a29d9 Mon Sep 17 00:00:00 2001
From: Nathan Lambert <nathan@huggingface.co>
Date: Sat, 29 Oct 2022 00:28:26 -0700
Subject: [PATCH 083/269] clean incomplete pages (#1008)

---
 docs/source/api/schedulers.mdx              |  6 +++--
 docs/source/conceptual/stable_diffusion.mdx |  4 +--
 docs/source/installation.mdx                |  2 +-
 src/diffusers/models/README.md              | 10 +-------
 src/diffusers/pipelines/README.md           | 27 +++++++++++----------
 src/diffusers/schedulers/README.md          | 16 +-----------
 6 files changed, 22 insertions(+), 43 deletions(-)

diff --git a/docs/source/api/schedulers.mdx b/docs/source/api/schedulers.mdx
index 80fbfdcb2321..3f88e563de19 100644
--- a/docs/source/api/schedulers.mdx
+++ b/docs/source/api/schedulers.mdx
@@ -89,7 +89,7 @@ Original implementation can be found [here](https://github.com/crowsonkb/k-diffu
 
 [[autodoc]] PNDMScheduler
 
-#### variance exploding stochastic differential equation (SDE) scheduler
+#### variance exploding stochastic differential equation (VE-SDE) scheduler
 
 Original paper can be found [here](https://arxiv.org/abs/2011.13456).
 
@@ -99,7 +99,9 @@ Original paper can be found [here](https://arxiv.org/abs/2011.13456).
 
 Original implementation can be found [here](https://github.com/crowsonkb/v-diffusion-pytorch/blob/987f8985e38208345c1959b0ea767a625831cc9b/diffusion/sampling.py#L296).
 
-#### variance preserving stochastic differential equation (SDE) scheduler
+[[autodoc]] IPNDMScheduler
+
+#### variance preserving stochastic differential equation (VP-SDE) scheduler
 
 Original paper can be found [here](https://arxiv.org/abs/2011.13456).
 
diff --git a/docs/source/conceptual/stable_diffusion.mdx b/docs/source/conceptual/stable_diffusion.mdx
index c00359d2ac1a..3e87674aa83a 100644
--- a/docs/source/conceptual/stable_diffusion.mdx
+++ b/docs/source/conceptual/stable_diffusion.mdx
@@ -12,6 +12,4 @@ specific language governing permissions and limitations under the License.
 
 # Stable Diffusion
 
-Under construction 🚧
-
-For now please visit this [very in-detail blog post](https://huggingface.co/blog/stable_diffusion)
+Please visit this [very in-detail blog post](https://huggingface.co/blog/stable_diffusion) on Stable Diffusion!
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index fa9466b6d1aa..1c9460eca8bd 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -53,7 +53,7 @@ The `main` version is useful for staying up-to-date with the latest developments
 For instance, if a bug has been fixed since the last official release but a new release hasn't been rolled out yet.
 However, this means the `main` version may not always be stable.
 We strive to keep the `main` version operational, and most issues are usually resolved within a few hours or a day.
-If you run into a problem, please open an [Issue](https://github.com/huggingface/transformers/issues) so we can fix it even sooner!
+If you run into a problem, please open an [Issue](https://github.com/huggingface/transformers/issues), so we can fix it even sooner!
 
 ## Editable install
 
diff --git a/src/diffusers/models/README.md b/src/diffusers/models/README.md
index e786fe518f03..80fe0bc38140 100644
--- a/src/diffusers/models/README.md
+++ b/src/diffusers/models/README.md
@@ -1,11 +1,3 @@
 # Models
 
-- Models: Neural network that models $p_\theta(\mathbf{x}_{t-1}|\mathbf{x}_t)$ (see image below) and is trained end-to-end to denoise a noisy input to an image. Examples: UNet, Conditioned UNet, 3D UNet, Transformer UNet
-
-## API
-
-TODO(Suraj, Patrick)
-
-## Examples
-
-TODO(Suraj, Patrick)
+For more detail on the models, please refer to the [docs](https://huggingface.co/docs/diffusers/api/models).
\ No newline at end of file
diff --git a/src/diffusers/pipelines/README.md b/src/diffusers/pipelines/README.md
index 66d3a8815d4e..86048eb5a052 100644
--- a/src/diffusers/pipelines/README.md
+++ b/src/diffusers/pipelines/README.md
@@ -30,19 +30,20 @@ If you are looking for *official* training examples, please have a look at [exam
 The following table summarizes all officially supported pipelines, their corresponding paper, and if 
 available a colab notebook to directly try them out.
 
-| Pipeline | Paper | Tasks | Colab
-|---|---|:---:|:---:|
-| [ddpm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | *Unconditional Image Generation* |
-| [ddim](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | *Unconditional Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| *Text-to-Image Generation* | 
-| [latent_diffusion_uncond](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Unconditional Image Generation* | 
-| [pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | *Unconditional Image Generation* | 
-| [score_sde_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | 
-| [score_sde_vp](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | 
-| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
-| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
-| [stochastic_karras_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | *Unconditional Image Generation* | 
+| Pipeline                                                                                                                      | Source                                                                                                                       | Tasks | Colab
+|-------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|:---:|:---:|
+| [dance diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dance_diffusion)                 | [**Dance Diffusion**](https://github.com/Harmonai-org/sample-generator)                                                      | *Unconditional Audio Generation* |
+| [ddpm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddpm)                                       | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239)                                             | *Unconditional Image Generation* |
+| [ddim](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddim)                                       | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502)                                                  | *Unconditional Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion)               | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)                         | *Text-to-Image Generation* | 
+| [latent_diffusion_uncond](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)                         | *Unconditional Image Generation* | 
+| [pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pndm)                                       | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778)                           | *Unconditional Image Generation* | 
+| [score_sde_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_ve)                       | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | 
+| [score_sde_vp](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_vp)                       | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | 
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stochastic_karras_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stochastic_karras_ve)       | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364)                    | *Unconditional Image Generation* | 
 
 **Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
 However, most of them can be adapted to use different scheduler components or even different model components. Some pipeline examples are shown in the [Examples](#examples) below.
diff --git a/src/diffusers/schedulers/README.md b/src/diffusers/schedulers/README.md
index 6a01c503a909..9494e357fd43 100644
--- a/src/diffusers/schedulers/README.md
+++ b/src/diffusers/schedulers/README.md
@@ -1,17 +1,3 @@
 # Schedulers
 
-- Schedulers are the algorithms to use diffusion models in inference as well as for training. They include the noise schedules and define algorithm-specific diffusion steps.
-- Schedulers can be used interchangeable between diffusion models in inference to find the preferred trade-off between speed and generation quality.
-- Schedulers are available in PyTorch and Jax.
-
-## API
-
-- Schedulers should provide one or more `def step(...)` functions that should be called iteratively to unroll the diffusion loop during 
-the forward pass.
-- Schedulers should be framework specific.
-
-## Examples
-
-- The DDPM scheduler was proposed in [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) and can be found in [scheduling_ddpm.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_ddpm.py). An example of how to use this scheduler can be found in [pipeline_ddpm.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddpm/pipeline_ddpm.py).
-- The DDIM scheduler was proposed in [Denoising Diffusion Implicit Models](https://arxiv.org/abs/2010.02502) and can be found in [scheduling_ddim.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_ddim.py). An example of how to use this scheduler can be found in [pipeline_ddim.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddim/pipeline_ddim.py).
-- The PNDM scheduler was proposed in [Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778) and can be found in [scheduling_pndm.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_pndm.py). An example of how to use this scheduler can be found in [pipeline_pndm.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pndm/pipeline_pndm.py).
+For more information on the schedulers, please refer to the [docs](https://huggingface.co/docs/diffusers/api/schedulers).
\ No newline at end of file

From 1fc208825d815a77c1a1176684b1b9c4d4aa0978 Mon Sep 17 00:00:00 2001
From: MarkRich <mark.rich388@gmail.com>
Date: Sat, 29 Oct 2022 00:31:42 -0700
Subject: [PATCH 084/269] Add seed resizing to community pipelines (#1011)

* add seed resizing to community examples

* actually add the file responsible for seed resizing
---
 examples/community/README.md                  |  86 ++++
 .../community/seed_resize_stable_diffusion.py | 366 ++++++++++++++++++
 2 files changed, 452 insertions(+)
 create mode 100644 examples/community/seed_resize_stable_diffusion.py

diff --git a/examples/community/README.md b/examples/community/README.md
index 3f92f202d8c4..2ef84291fc7d 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -16,6 +16,8 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Speech to Image                        | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images                                                                                                                                                                                                                                                                                                                                                                                                            | [Speech to Image](#speech-to-image)                               | -                                                                                                                                                                                                                  | [Mikail Duzenli](https://github.com/MikailINTech)
 | Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values                                                                                                                                                                                                                                                                                                     | [Wildcard Stable Diffusion](#wildcard-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Shyam Sudhakaran](https://github.com/shyamsn97) |
 | Composable Stable Diffusion| Stable Diffusion Pipeline that supports prompts that contain "&#124;" in prompts (as an AND condition) and weights (separated by "&#124;" as well) to positively / negatively weight prompts.                                                                                                                                                                                                                                                                                                     | [Composable Stable Diffusion](#composable-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
+| Seed Resizing Stable Diffusion| Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation.                                                                                                                                                                                                                                                                                                     | [Seed Resizing](#seed-resizing)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
+
 
 
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
@@ -371,3 +373,87 @@ for i in range(4):
 for i, img in enumerate(images):
     img.save(f"./composable_diffusion/image_{i}.png")
 ```
+### Seed Resizing 
+Test seed resizing. Originally generate an image in 512 by 512, then generate image with same seed at 512 by 592 using seed resizing. Finally, generate 512 by 592 using original stable diffusion pipeline.
+
+```python
+import torch as th
+import numpy as np
+from diffusers import DiffusionPipeline
+
+has_cuda = th.cuda.is_available()
+device = th.device('cpu' if not has_cuda else 'cuda')
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    use_auth_token=True,
+    custom_pipeline="seed_resize_stable_diffusion"
+).to(device)
+
+def dummy(images, **kwargs):
+    return images, False
+
+pipe.safety_checker = dummy
+
+
+images = []
+th.manual_seed(0)
+generator = th.Generator("cuda").manual_seed(0)
+
+seed = 0
+prompt = "A painting of a futuristic cop"
+
+width = 512
+height = 512
+
+res = pipe(
+    prompt,
+    guidance_scale=7.5,
+    num_inference_steps=50,
+    height=height,
+    width=width,
+    generator=generator)
+image = res.images[0]
+image.save('./seed_resize/seed_resize_{w}_{h}_image.png'.format(w=width, h=height))
+
+
+th.manual_seed(0)
+generator = th.Generator("cuda").manual_seed(0)
+
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    use_auth_token=True,
+    custom_pipeline="/home/mark/open_source/diffusers/examples/community/"
+).to(device)
+
+width = 512
+height = 592
+
+res = pipe(
+    prompt,
+    guidance_scale=7.5,
+    num_inference_steps=50,
+    height=height,
+    width=width,
+    generator=generator)
+image = res.images[0]
+image.save('./seed_resize/seed_resize_{w}_{h}_image.png'.format(w=width, h=height))
+
+pipe_compare = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    use_auth_token=True,
+    custom_pipeline="/home/mark/open_source/diffusers/examples/community/"
+).to(device)
+
+res = pipe_compare(
+    prompt,
+    guidance_scale=7.5,
+    num_inference_steps=50,
+    height=height,
+    width=width,
+    generator=generator
+)
+
+image = res.images[0]
+image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, h=height))
+```
\ No newline at end of file
diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py
new file mode 100644
index 000000000000..f912663a687d
--- /dev/null
+++ b/examples/community/seed_resize_stable_diffusion.py
@@ -0,0 +1,366 @@
+"""
+    modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+"""
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import logging
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class SeedResizeStableDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        text_embeddings: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
+        if text_embeddings is None:
+            text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_shape_reference = (batch_size * num_images_per_prompt, self.unet.in_channels, 64, 64)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents_reference = torch.randn(
+                    latents_shape_reference, generator=generator, device="cpu", dtype=latents_dtype
+                ).to(self.device)
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents_reference = torch.randn(
+                    latents_shape_reference, generator=generator, device=self.device, dtype=latents_dtype
+                )
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents_reference.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents_reference = latents_reference.to(self.device)
+            latents = latents.to(self.device)
+
+        # This is the key part of the pipeline where we
+        # try to ensure that the generated images w/ the same seed
+        # but different sizes actually result in similar images
+        dx = (latents_shape[3] - latents_shape_reference[3]) // 2
+        dy = (latents_shape[2] - latents_shape_reference[2]) // 2
+        w = latents_shape_reference[3] if dx >= 0 else latents_shape_reference[3] + 2 * dx
+        h = latents_shape_reference[2] if dy >= 0 else latents_shape_reference[2] + 2 * dy
+        tx = 0 if dx < 0 else dx
+        ty = 0 if dy < 0 else dy
+        dx = max(-dx, 0)
+        dy = max(-dy, 0)
+        # import pdb
+        # pdb.set_trace()
+        latents[:, :, ty : ty + h, tx : tx + w] = latents_reference[:, :, dy : dy + h, dx : dx + w]
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

From a59f9990fc1251ada20e86dfa615b04360bd013f Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Sat, 29 Oct 2022 20:27:00 +0200
Subject: [PATCH 085/269] Tests: upgrade PyTorch cuda to 11.7 to fix examples
 tests. (#1048)

Tests: upgrade PyTorch cuda to 11.7.

Otherwise the cuda versions of torch and torchvision mismatch, and
examples tests fail. We were requesting cuda 11.6 for PyTorch, and the
default torchvision (via setup.py).

Another option would be to include torchvision in the same pip install
line as torch.
---
 .github/workflows/push_tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index dfd83aa9af46..f1fc5484a25a 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -34,7 +34,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         python -m pip uninstall -y torch torchvision torchtext
-        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu116
+        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu117
         python -m pip install -e .[quality,test]
         python -m pip install git+https://github.com/huggingface/accelerate
 
@@ -80,7 +80,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         python -m pip uninstall -y torch torchvision torchtext
-        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu116
+        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu117
         python -m pip install -e .[quality,test,training]
         python -m pip install git+https://github.com/huggingface/accelerate
 

From 95414bd6bf9bb34a312a7c55f10ba9b379f33890 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Sat, 29 Oct 2022 21:09:32 +0200
Subject: [PATCH 086/269] Experimental: allow fp16 in `mps` (#961)

* Docs: refer to pre-RC version of PyTorch 1.13.0.

* Remove temporary workaround for unavailable op.

* Update comment to make it less ambiguous.

* Remove use of contiguous in mps.

It appears to not longer be necessary.

* Special case: use einsum for much better performance in mps

* Update mps docs.

* MPS: make pipeline work in half precision.
---
 src/diffusers/models/attention.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index dce30c6a4aa6..bf04c3e6a3ca 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -376,6 +376,12 @@ def __init__(self, dim_in: int, dim_out: int):
         super().__init__()
         self.proj = nn.Linear(dim_in, dim_out * 2)
 
+    def gelu(self, gate):
+        if gate.device.type != "mps":
+            return F.gelu(gate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
+
     def forward(self, hidden_states):
         hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
-        return hidden_states * F.gelu(gate)
+        return hidden_states * self.gelu(gate)

From 8e4fd686e0b45ac2406677b08ac3dca79aa290b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonatan=20K=C5=82osko?= <jonatanklosko@gmail.com>
Date: Sun, 30 Oct 2022 20:07:55 +0100
Subject: [PATCH 087/269] Move safety detection to model call in Flax safety
 checker (#1023)

* Move safety detection to model call in Flax safety checker

* Update src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
---
 .../pipeline_flax_stable_diffusion.py         | 41 +++++++----
 .../stable_diffusion/safety_checker_flax.py   | 73 +++----------------
 2 files changed, 37 insertions(+), 77 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index e4f56d94dac8..fe0e284c6720 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -1,3 +1,4 @@
+import warnings
 from functools import partial
 from typing import Dict, List, Optional, Union
 
@@ -97,9 +98,9 @@ def prepare_inputs(self, prompt: Union[str, List[str]]):
         )
         return text_input.input_ids
 
-    def _get_safety_scores(self, features, params):
-        special_cos_dist, cos_dist = self.safety_checker(features, params)
-        return (special_cos_dist, cos_dist)
+    def _get_has_nsfw_concepts(self, features, params):
+        has_nsfw_concepts = self.safety_checker(features, params)
+        return has_nsfw_concepts
 
     def _run_safety_checker(self, images, safety_model_params, jit=False):
         # safety_model_params should already be replicated when jit is True
@@ -108,20 +109,28 @@ def _run_safety_checker(self, images, safety_model_params, jit=False):
 
         if jit:
             features = shard(features)
-            special_cos_dist, cos_dist = _p_get_safety_scores(self, features, safety_model_params)
-            special_cos_dist = unshard(special_cos_dist)
-            cos_dist = unshard(cos_dist)
+            has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
+            has_nsfw_concepts = unshard(has_nsfw_concepts)
             safety_model_params = unreplicate(safety_model_params)
         else:
-            special_cos_dist, cos_dist = self._get_safety_scores(features, safety_model_params)
+            has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
 
-        images, has_nsfw = self.safety_checker.filtered_with_scores(
-            special_cos_dist,
-            cos_dist,
-            images,
-            safety_model_params,
-        )
-        return images, has_nsfw
+        images_was_copied = False
+        for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
+            if has_nsfw_concept:
+                if not images_was_copied:
+                    images_was_copied = True
+                    images = images.copy()
+
+                images[idx] = np.zeros(images[idx].shape, dtype=np.uint8)  # black image
+
+            if any(has_nsfw_concepts):
+                warnings.warn(
+                    "Potential NSFW content was detected in one or more images. A black image will be returned"
+                    " instead. Try again with a different prompt and/or seed."
+                )
+
+        return images, has_nsfw_concepts
 
     def _generate(
         self,
@@ -310,8 +319,8 @@ def _p_generate(
 
 
 @partial(jax.pmap, static_broadcasted_argnums=(0,))
-def _p_get_safety_scores(pipe, features, params):
-    return pipe._get_safety_scores(features, params)
+def _p_get_has_nsfw_concepts(pipe, features, params):
+    return pipe._get_has_nsfw_concepts(features, params)
 
 
 def unshard(x: jnp.ndarray):
diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py b/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
index b3cd8eef02fa..e4ea381a8c9b 100644
--- a/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
+++ b/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
@@ -1,8 +1,5 @@
-import warnings
 from typing import Optional, Tuple
 
-import numpy as np
-
 import jax
 import jax.numpy as jnp
 from flax import linen as nn
@@ -39,56 +36,22 @@ def __call__(self, clip_input):
 
         special_cos_dist = jax_cosine_distance(image_embeds, self.special_care_embeds)
         cos_dist = jax_cosine_distance(image_embeds, self.concept_embeds)
-        return special_cos_dist, cos_dist
-
-    def filtered_with_scores(self, special_cos_dist, cos_dist, images):
-        batch_size = special_cos_dist.shape[0]
-        special_cos_dist = np.asarray(special_cos_dist)
-        cos_dist = np.asarray(cos_dist)
-
-        result = []
-        for i in range(batch_size):
-            result_img = {"special_scores": {}, "special_care": [], "concept_scores": {}, "bad_concepts": []}
-
-            # increase this value to create a stronger `nfsw` filter
-            # at the cost of increasing the possibility of filtering benign image inputs
-            adjustment = 0.0
-
-            for concept_idx in range(len(special_cos_dist[0])):
-                concept_cos = special_cos_dist[i][concept_idx]
-                concept_threshold = self.special_care_embeds_weights[concept_idx].item()
-                result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
-                if result_img["special_scores"][concept_idx] > 0:
-                    result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]})
-                    adjustment = 0.01
-
-            for concept_idx in range(len(cos_dist[0])):
-                concept_cos = cos_dist[i][concept_idx]
-                concept_threshold = self.concept_embeds_weights[concept_idx].item()
-                result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
-                if result_img["concept_scores"][concept_idx] > 0:
-                    result_img["bad_concepts"].append(concept_idx)
-
-            result.append(result_img)
 
-        has_nsfw_concepts = [len(res["bad_concepts"]) > 0 for res in result]
+        # increase this value to create a stronger `nfsw` filter
+        # at the cost of increasing the possibility of filtering benign image inputs
+        adjustment = 0.0
 
-        images_was_copied = False
-        for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
-            if has_nsfw_concept:
-                if not images_was_copied:
-                    images_was_copied = True
-                    images = images.copy()
+        special_scores = special_cos_dist - self.special_care_embeds_weights[None, :] + adjustment
+        special_scores = jnp.round(special_scores, 3)
+        is_special_care = jnp.any(special_scores > 0, axis=1, keepdims=True)
+        # Use a lower threshold if an image has any special care concept
+        special_adjustment = is_special_care * 0.01
 
-                images[idx] = np.zeros(images[idx].shape)  # black image
+        concept_scores = cos_dist - self.concept_embeds_weights[None, :] + special_adjustment
+        concept_scores = jnp.round(concept_scores, 3)
+        has_nsfw_concepts = jnp.any(concept_scores > 0, axis=1)
 
-            if any(has_nsfw_concepts):
-                warnings.warn(
-                    "Potential NSFW content was detected in one or more images. A black image will be returned"
-                    " instead. Try again with a different prompt and/or seed."
-                )
-
-        return images, has_nsfw_concepts
+        return has_nsfw_concepts
 
 
 class FlaxStableDiffusionSafetyChecker(FlaxPreTrainedModel):
@@ -133,15 +96,3 @@ def __call__(
             jnp.array(clip_input, dtype=jnp.float32),
             rngs={},
         )
-
-    def filtered_with_scores(self, special_cos_dist, cos_dist, images, params: dict = None):
-        def _filtered_with_scores(module, special_cos_dist, cos_dist, images):
-            return module.filtered_with_scores(special_cos_dist, cos_dist, images)
-
-        return self.module.apply(
-            {"params": params or self.params},
-            special_cos_dist,
-            cos_dist,
-            images,
-            method=_filtered_with_scores,
-        )

From 707b8684b3476cde8f63718f7e9b2daf3612302d Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 31 Oct 2022 09:13:37 +0000
Subject: [PATCH 088/269] fix slow test

---
 .../pipelines/stable_diffusion/test_stable_diffusion_inpaint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index b577436f4e5d..e63738ae8a22 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -349,7 +349,7 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self):
         image = output.images[0]
 
         assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        assert np.abs(expected_image - image).max() < 5e-1
 
     def test_stable_diffusion_inpaint_pipeline_pndm(self):
         init_image = load_image(

From 1606eb994a754cc512dfa08d926e199851abc9be Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Mon, 31 Oct 2022 13:38:43 +0100
Subject: [PATCH 089/269] Fix pipelines user_agent, ignore CI requests (#1058)

* Fix pipelines user_agent, ignore CI requests

* fix circular import

* N/A versions

* N/A versions
---
 .github/workflows/pr_tests.yml       |  1 +
 .github/workflows/push_tests.yml     |  1 +
 src/diffusers/hub_utils.py           | 46 +++++++++++++++++++++++++---
 src/diffusers/pipeline_flax_utils.py |  9 ++++++
 src/diffusers/pipeline_utils.py      | 10 ++++--
 src/diffusers/utils/import_utils.py  |  4 ++-
 6 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index 81c75fecec6e..75ab6c7c5f2a 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -10,6 +10,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
+  DIFFUSERS_IS_CI: yes
   OMP_NUM_THREADS: 8
   MKL_NUM_THREADS: 8
   PYTEST_TIMEOUT: 60
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index f1fc5484a25a..397fecb560f0 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -6,6 +6,7 @@ on:
       - main
 
 env:
+  DIFFUSERS_IS_CI: yes
   HF_HOME: /mnt/cache
   OMP_NUM_THREADS: 8
   MKL_NUM_THREADS: 8
diff --git a/src/diffusers/hub_utils.py b/src/diffusers/hub_utils.py
index 1f8cc0db0fc9..8bf0933a1dce 100644
--- a/src/diffusers/hub_utils.py
+++ b/src/diffusers/hub_utils.py
@@ -16,13 +16,25 @@
 
 import os
 import shutil
+import sys
 from pathlib import Path
-from typing import Optional
+from typing import Dict, Optional, Union
+from uuid import uuid4
 
 from huggingface_hub import HfFolder, Repository, whoami
 
-from .pipeline_utils import DiffusionPipeline
-from .utils import deprecate, is_modelcards_available, logging
+from . import __version__
+from .utils import ENV_VARS_TRUE_VALUES, deprecate, logging
+from .utils.import_utils import (
+    _flax_version,
+    _jax_version,
+    _onnxruntime_version,
+    _torch_version,
+    is_flax_available,
+    is_modelcards_available,
+    is_onnx_available,
+    is_torch_available,
+)
 
 
 if is_modelcards_available():
@@ -33,6 +45,32 @@
 
 
 MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "utils" / "model_card_template.md"
+SESSION_ID = uuid4().hex
+DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", "").upper() in ENV_VARS_TRUE_VALUES
+
+
+def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
+    """
+    Formats a user-agent string with basic info about a request.
+    """
+    ua = f"diffusers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}"
+    if DISABLE_TELEMETRY:
+        return ua + "; telemetry/off"
+    if is_torch_available():
+        ua += f"; torch/{_torch_version}"
+    if is_flax_available():
+        ua += f"; jax/{_jax_version}"
+        ua += f"; flax/{_flax_version}"
+    if is_onnx_available():
+        ua += f"; onnxruntime/{_onnxruntime_version}"
+    # CI will set this value to True
+    if os.environ.get("DIFFUSERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES:
+        ua += "; is_ci/true"
+    if isinstance(user_agent, dict):
+        ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items())
+    elif isinstance(user_agent, str):
+        ua += "; " + user_agent
+    return ua
 
 
 def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
@@ -101,7 +139,7 @@ def init_git_repo(args, at_init: bool = False):
 
 def push_to_hub(
     args,
-    pipeline: DiffusionPipeline,
+    pipeline,
     repo: Repository,
     commit_message: Optional[str] = "End of training",
     blocking: bool = True,
diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
index e96c0c7467f3..f0330531e17f 100644
--- a/src/diffusers/pipeline_flax_utils.py
+++ b/src/diffusers/pipeline_flax_utils.py
@@ -29,6 +29,7 @@
 from tqdm.auto import tqdm
 
 from .configuration_utils import ConfigMixin
+from .hub_utils import http_user_agent
 from .modeling_flax_utils import FLAX_WEIGHTS_NAME, FlaxModelMixin
 from .schedulers.scheduling_utils_flax import SCHEDULER_CONFIG_NAME, FlaxSchedulerMixin
 from .utils import CONFIG_NAME, DIFFUSERS_CACHE, BaseOutput, is_transformers_available, logging
@@ -301,6 +302,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             allow_patterns = [os.path.join(k, "*") for k in folder_names]
             allow_patterns += [FLAX_WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, CONFIG_NAME, cls.config_name]
 
+            if cls != FlaxDiffusionPipeline:
+                requested_pipeline_class = cls.__name__
+            else:
+                requested_pipeline_class = config_dict.get("_class_name", cls.__name__)
+            user_agent = {"pipeline_class": requested_pipeline_class}
+            user_agent = http_user_agent(user_agent)
+
             # download all allow_patterns
             cached_folder = snapshot_download(
                 pretrained_model_name_or_path,
@@ -311,6 +319,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 use_auth_token=use_auth_token,
                 revision=revision,
                 allow_patterns=allow_patterns,
+                user_agent=user_agent,
             )
         else:
             cached_folder = pretrained_model_name_or_path
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index c0a44363a2f9..92623f9f9245 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -30,9 +30,9 @@
 from PIL import Image
 from tqdm.auto import tqdm
 
-from . import __version__
 from .configuration_utils import ConfigMixin
 from .dynamic_modules_utils import get_class_from_dynamic_module
+from .hub_utils import http_user_agent
 from .schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from .utils import (
     CONFIG_NAME,
@@ -398,10 +398,14 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             if custom_pipeline is not None:
                 allow_patterns += [CUSTOM_PIPELINE_FILE_NAME]
 
-            requested_pipeline_class = config_dict.get("_class_name", cls.__name__)
-            user_agent = {"diffusers": __version__, "pipeline_class": requested_pipeline_class}
+            if cls != DiffusionPipeline:
+                requested_pipeline_class = cls.__name__
+            else:
+                requested_pipeline_class = config_dict.get("_class_name", cls.__name__)
+            user_agent = {"pipeline_class": requested_pipeline_class}
             if custom_pipeline is not None:
                 user_agent["custom_pipeline"] = custom_pipeline
+            user_agent = http_user_agent(user_agent)
 
             # download all allow_patterns
             cached_folder = snapshot_download(
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
index b2aabee70c92..2a5f7f64dd07 100644
--- a/src/diffusers/utils/import_utils.py
+++ b/src/diffusers/utils/import_utils.py
@@ -90,7 +90,8 @@
     logger.info("Disabling Tensorflow because USE_TORCH is set")
     _tf_available = False
 
-
+_jax_version = "N/A"
+_flax_version = "N/A"
 if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
     _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
     if _flax_available:
@@ -136,6 +137,7 @@
     _modelcards_available = False
 
 
+_onnxruntime_version = "N/A"
 _onnx_available = importlib.util.find_spec("onnxruntime") is not None
 if _onnx_available:
     candidates = ("onnxruntime", "onnxruntime-gpu", "onnxruntime-directml", "onnxruntime-openvino")

From e4d264e4eb23de860220c87a934f4dfb41879923 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 31 Oct 2022 14:06:03 +0100
Subject: [PATCH 090/269] [GitBot] Automatically close issues after inactivitiy
 (#1079)

* [GitBot] Automatically close issues after inactivitiy

* improve

* Add unstale

* typo

Co-authored-by: anton-l <anton@huggingface.co>
---
 utils/stale.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/utils/stale.py b/utils/stale.py
index 4162875a62aa..a0193a913ee5 100644
--- a/utils/stale.py
+++ b/utils/stale.py
@@ -42,11 +42,27 @@ def main():
         last_comment = comments[0] if len(comments) > 0 else None
         if (
             last_comment is not None
+            and last_comment.user.login == "github-actions[bot]"
+            and (dt.utcnow() - issue.updated_at).days > 7
+            and (dt.utcnow() - issue.created_at).days >= 30
+            and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
+        ):
+            # Closes the issue after 7 days of inactivity since the Stalebot notification.
+            issue.edit(state="closed")
+        elif (
+            "stale" in issue.get_labels()
+            and last_comment is not None
             and last_comment.user.login != "github-actions[bot]"
-            and (dt.utcnow() - issue.updated_at).days > 23
+        ):
+            # Opens the issue if someone other than Stalebot commented.
+            issue.edit(state="open")
+            issue.remove_from_labels("stale")
+        elif (
+            (dt.utcnow() - issue.updated_at).days > 23
             and (dt.utcnow() - issue.created_at).days >= 30
             and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
         ):
+            # Post a Stalebot notification after 23 days of inactivity.
             issue.create_comment(
                 "This issue has been automatically marked as stale because it has not had "
                 "recent activity. If you think this still needs to be addressed "
@@ -54,7 +70,7 @@ def main():
                 "[contributing guidelines](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md) "
                 "are likely to be ignored."
             )
-            issue.edit(labels=["stale"])
+            issue.add_to_labels("stale")
 
 
 if __name__ == "__main__":

From bf7b0bc25b4a10da78ae9dd19ae00ba840585d40 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Mon, 31 Oct 2022 15:03:33 +0100
Subject: [PATCH 091/269] Allow `safety_checker` to be `None` when using CPU
 offload (#1078)

Allow None safety_checker when using CPU offload.
---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 2a33074a2473..098e8e6a85c9 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -133,7 +133,8 @@ def enable_sequential_cpu_offload(self):
         device = torch.device("cuda")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
-            cpu_offload(cpu_offloaded_model, device)
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
 
     @torch.no_grad()
     def __call__(

From a1ea8c01c31a44bf48f6a3b85ccabeb45ef6418f Mon Sep 17 00:00:00 2001
From: hlky <106811348+hlky@users.noreply.github.com>
Date: Mon, 31 Oct 2022 15:20:38 +0000
Subject: [PATCH 092/269] k-diffusion-euler (#1019)

* k-diffusion-euler

* make style make quality

* make fix-copies

* fix tests for euler a

* Update src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* Update src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* Update src/diffusers/schedulers/scheduling_euler_discrete.py

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* Update src/diffusers/schedulers/scheduling_euler_discrete.py

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* remove unused arg and method

* update doc

* quality

* make flake happy

* use logger instead of warn

* raise error instead of deprication

* don't require scipy

* pass generator in step

* fix tests

* Apply suggestions from code review

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Update tests/test_scheduler.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* remove unused generator

* pass generator as extra_step_kwargs

* update tests

* pass generator as kwarg

* pass generator as kwarg

* quality

* fix test for lms

* fix tests

Co-authored-by: patil-suraj <surajp815@gmail.com>
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 src/diffusers/__init__.py                     |   2 +
 .../pipeline_stable_diffusion.py              |  17 +-
 .../pipeline_stable_diffusion_img2img.py      |  17 +-
 .../pipeline_stable_diffusion_inpaint.py      |   5 +
 ...ipeline_stable_diffusion_inpaint_legacy.py |   5 +
 src/diffusers/schedulers/__init__.py          |   2 +
 .../scheduling_euler_ancestral_discrete.py    | 261 +++++++++++++++++
 .../schedulers/scheduling_euler_discrete.py   | 270 ++++++++++++++++++
 src/diffusers/utils/dummy_pt_objects.py       |  30 ++
 .../stable_diffusion/test_stable_diffusion.py |  92 ++++++
 tests/test_scheduler.py                       | 169 ++++++++++-
 11 files changed, 858 insertions(+), 12 deletions(-)
 create mode 100644 src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
 create mode 100644 src/diffusers/schedulers/scheduling_euler_discrete.py

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 2c531cf8cee0..49c3e82b8e7b 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -41,6 +41,8 @@
     from .schedulers import (
         DDIMScheduler,
         DDPMScheduler,
+        EulerAncestralDiscreteScheduler,
+        EulerDiscreteScheduler,
         IPNDMScheduler,
         KarrasVeScheduler,
         PNDMScheduler,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 098e8e6a85c9..e64f50b38993 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -9,7 +9,13 @@
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...pipeline_utils import DiffusionPipeline
-from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...schedulers import (
+    DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import deprecate, logging
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
@@ -52,7 +58,9 @@ def __init__(
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        scheduler: Union[
+            DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler
+        ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
     ):
@@ -334,6 +342,11 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
         for i, t in enumerate(self.progress_bar(timesteps_tensor)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index db8134834564..799b1c19f6c1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -10,7 +10,13 @@
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...pipeline_utils import DiffusionPipeline
-from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...schedulers import (
+    DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import deprecate, logging
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
@@ -63,7 +69,9 @@ def __init__(
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        scheduler: Union[
+            DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler
+        ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
     ):
@@ -335,6 +343,11 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
         latents = init_latents
 
         t_start = max(num_inference_steps - init_timestep + offset, 0)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index d49f55aa921d..77bd55f2744d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -379,6 +379,11 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
         for i, t in enumerate(self.progress_bar(timesteps_tensor)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 71a2be62128c..55472f34cf68 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -352,6 +352,11 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
         latents = init_latents
 
         t_start = max(num_inference_steps - init_timestep + offset, 0)
diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index f179acde09e2..c3999d2cac61 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -19,6 +19,8 @@
 if is_torch_available():
     from .scheduling_ddim import DDIMScheduler
     from .scheduling_ddpm import DDPMScheduler
+    from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
+    from .scheduling_euler_discrete import EulerDiscreteScheduler
     from .scheduling_ipndm import IPNDMScheduler
     from .scheduling_karras_ve import KarrasVeScheduler
     from .scheduling_pndm import PNDMScheduler
diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
new file mode 100644
index 000000000000..b153dfbe2f02
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -0,0 +1,261 @@
+# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, deprecate, logging
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerAncestralDiscrete
+class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.FloatTensor
+    pred_original_sample: Optional[torch.FloatTensor] = None
+
+
+class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Ancestral sampling with Euler method steps. Based on the original k-diffusion implementation by Katherine Crowson:
+    https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+    [`~ConfigMixin.from_config`] functions.
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[np.ndarray] = None,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.from_numpy(trained_betas)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = (
+                torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+            )
+        else:
+            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = self.sigmas.max()
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.is_scale_input_called = False
+
+    def scale_model_input(
+        self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor]
+    ) -> torch.FloatTensor:
+        """
+        Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+        Args:
+            sample (`torch.FloatTensor`): input sample
+            timestep (`float` or `torch.FloatTensor`): the current timestep in the diffusion chain
+
+        Returns:
+            `torch.FloatTensor`: scaled input sample
+        """
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        sample = sample / ((sigma**2 + 1) ** 0.5)
+        self.is_scale_input_called = True
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, optional):
+                the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas).to(device=device)
+        self.timesteps = torch.from_numpy(timesteps).to(device=device)
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        sample: torch.FloatTensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            timestep (`float`): current timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
+            generator (`torch.Generator`, optional): Random number generator.
+            return_dict (`bool`): option for returning tuple rather than EulerAncestralDiscreteSchedulerOutput class
+
+        Returns:
+            [`~schedulers.scheduling_utils.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
+            [`~schedulers.scheduling_utils.EulerAncestralDiscreteSchedulerOutput`] if `return_dict` is True, otherwise
+            a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+
+        if (
+            isinstance(timestep, int)
+            or isinstance(timestep, torch.IntTensor)
+            or isinstance(timestep, torch.LongTensor)
+        ):
+            raise ValueError(
+                "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                " one of the `scheduler.timesteps` as a timestep.",
+            )
+
+        if not self.is_scale_input_called:
+            logger.warn(
+                "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+                "See `StableDiffusionPipeline` for a usage example."
+            )
+
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        pred_original_sample = sample - sigma * model_output
+        sigma_from = self.sigmas[step_index]
+        sigma_to = self.sigmas[step_index + 1]
+        sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
+        sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
+
+        # 2. Convert to an ODE derivative
+        derivative = (sample - pred_original_sample) / sigma
+
+        dt = sigma_down - sigma
+
+        prev_sample = sample + derivative * dt
+
+        device = model_output.device if torch.is_tensor(model_output) else "cpu"
+        noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device)
+        prev_sample = prev_sample + noise * sigma_up
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return EulerAncestralDiscreteSchedulerOutput(
+            prev_sample=prev_sample, pred_original_sample=pred_original_sample
+        )
+
+    def add_noise(
+        self,
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
+        timesteps: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        self.sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            self.timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            self.timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        schedule_timesteps = self.timesteps
+
+        if isinstance(timesteps, torch.IntTensor) or isinstance(timesteps, torch.LongTensor):
+            deprecate(
+                "timesteps as indices",
+                "0.8.0",
+                "Passing integer indices  (e.g. from `enumerate(timesteps)`) as timesteps to"
+                " `EulerAncestralDiscreteScheduler.add_noise()` will not be supported in future versions. Make sure to"
+                " pass values from `scheduler.timesteps` as timesteps.",
+                standard_warn=False,
+            )
+            step_indices = timesteps
+        else:
+            step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = self.sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
new file mode 100644
index 000000000000..a71fbda13df6
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -0,0 +1,270 @@
+# Copyright 2022 Katherine Crowson and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, deprecate, logging
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerDiscrete
+class EulerDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample (x_{0}) based on the model output from the current timestep.
+            `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.FloatTensor
+    pred_original_sample: Optional[torch.FloatTensor] = None
+
+
+class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Euler scheduler (Algorithm 2) from Karras et al. (2022) https://arxiv.org/abs/2206.00364. . Based on the original
+    k-diffusion implementation by Katherine Crowson:
+    https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+    [`~ConfigMixin.from_config`] functions.
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear` or `scaled_linear`.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[np.ndarray] = None,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.from_numpy(trained_betas)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = (
+                torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+            )
+        else:
+            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = self.sigmas.max()
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.is_scale_input_called = False
+
+    def scale_model_input(
+        self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor]
+    ) -> torch.FloatTensor:
+        """
+        Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
+
+        Args:
+            sample (`torch.FloatTensor`): input sample
+            timestep (`float` or `torch.FloatTensor`): the current timestep in the diffusion chain
+
+        Returns:
+            `torch.FloatTensor`: scaled input sample
+        """
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+        sample = sample / ((sigma**2 + 1) ** 0.5)
+        self.is_scale_input_called = True
+        return sample
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, optional):
+                the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+
+        timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas).to(device=device)
+        self.timesteps = torch.from_numpy(timesteps).to(device=device)
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        sample: torch.FloatTensor,
+        s_churn: float = 0.0,
+        s_tmin: float = 0.0,
+        s_tmax: float = float("inf"),
+        s_noise: float = 1.0,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[EulerDiscreteSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            timestep (`float`): current timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
+            s_churn (`float`)
+            s_tmin  (`float`)
+            s_tmax  (`float`)
+            s_noise (`float`)
+            generator (`torch.Generator`, optional): Random number generator.
+            return_dict (`bool`): option for returning tuple rather than EulerDiscreteSchedulerOutput class
+
+        Returns:
+            [`~schedulers.scheduling_utils.EulerDiscreteSchedulerOutput`] or `tuple`:
+            [`~schedulers.scheduling_utils.EulerDiscreteSchedulerOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+
+        if (
+            isinstance(timestep, int)
+            or isinstance(timestep, torch.IntTensor)
+            or isinstance(timestep, torch.LongTensor)
+        ):
+            raise ValueError(
+                "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
+                " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
+                " one of the `scheduler.timesteps` as a timestep.",
+            )
+
+        if not self.is_scale_input_called:
+            logger.warn(
+                "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
+                "See `StableDiffusionPipeline` for a usage example."
+            )
+
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+
+        step_index = (self.timesteps == timestep).nonzero().item()
+        sigma = self.sigmas[step_index]
+
+        gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
+
+        device = model_output.device if torch.is_tensor(model_output) else "cpu"
+        noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device)
+        eps = noise * s_noise
+        sigma_hat = sigma * (gamma + 1)
+
+        if gamma > 0:
+            sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
+
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        pred_original_sample = sample - sigma_hat * model_output
+
+        # 2. Convert to an ODE derivative
+        derivative = (sample - pred_original_sample) / sigma_hat
+
+        dt = self.sigmas[step_index + 1] - sigma_hat
+
+        prev_sample = sample + derivative * dt
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+    def add_noise(
+        self,
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
+        timesteps: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        # Make sure sigmas and timesteps have the same device and dtype as original_samples
+        self.sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+            # mps does not support float64
+            self.timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
+        else:
+            self.timesteps = self.timesteps.to(original_samples.device)
+            timesteps = timesteps.to(original_samples.device)
+
+        schedule_timesteps = self.timesteps
+
+        if isinstance(timesteps, torch.IntTensor) or isinstance(timesteps, torch.LongTensor):
+            deprecate(
+                "timesteps as indices",
+                "0.8.0",
+                "Passing integer indices  (e.g. from `enumerate(timesteps)`) as timesteps to"
+                " `EulerDiscreteScheduler.add_noise()` will not be supported in future versions. Make sure to"
+                " pass values from `scheduler.timesteps` as timesteps.",
+                standard_warn=False,
+            )
+            step_indices = timesteps
+        else:
+            step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+        sigma = self.sigmas[step_indices].flatten()
+        while len(sigma.shape) < len(original_samples.shape):
+            sigma = sigma.unsqueeze(-1)
+
+        noisy_samples = original_samples + noise * sigma
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 93834c76550b..5dd583279708 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -272,6 +272,36 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class EulerAncestralDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class EulerDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class IPNDMScheduler(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 6b23aca33bc6..c21d40f4c5da 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -24,6 +24,8 @@
 from diffusers import (
     AutoencoderKL,
     DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
     LMSDiscreteScheduler,
     PNDMScheduler,
     StableDiffusionPipeline,
@@ -361,6 +363,96 @@ def test_stable_diffusion_k_lms(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
+    def test_stable_diffusion_k_euler_ancestral(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = EulerAncestralDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_k_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = EulerDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
     def test_stable_diffusion_attention_chunk(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         unet = self.dummy_cond_unet
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 194fb66f663f..9285eed20ff0 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import inspect
 import tempfile
 import unittest
 from typing import Dict, List, Tuple
@@ -22,6 +23,8 @@
 from diffusers import (
     DDIMScheduler,
     DDPMScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
     IPNDMScheduler,
     LMSDiscreteScheduler,
     PNDMScheduler,
@@ -77,7 +80,11 @@ def check_over_configs(self, time_step=0, **config):
 
         num_inference_steps = kwargs.pop("num_inference_steps", None)
 
+        # TODO(Suraj) - delete the following two lines once DDPM, DDIM, and PNDM have timesteps casted to float by default
         for scheduler_class in self.scheduler_classes:
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                time_step = float(time_step)
+
             sample = self.dummy_sample
             residual = 0.1 * sample
 
@@ -94,7 +101,13 @@ def check_over_configs(self, time_step=0, **config):
             elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
                 kwargs["num_inference_steps"] = num_inference_steps
 
+            # Set the seed before step() as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.Generator().manual_seed(0)
             output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.Generator().manual_seed(0)
             new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
 
             assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
@@ -106,6 +119,9 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
         num_inference_steps = kwargs.pop("num_inference_steps", None)
 
         for scheduler_class in self.scheduler_classes:
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                time_step = float(time_step)
+
             sample = self.dummy_sample
             residual = 0.1 * sample
 
@@ -122,9 +138,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
             elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
                 kwargs["num_inference_steps"] = num_inference_steps
 
-            torch.manual_seed(0)
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.Generator().manual_seed(0)
             output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
-            torch.manual_seed(0)
+
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.Generator().manual_seed(0)
             new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
 
             assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
@@ -141,6 +160,10 @@ def test_from_pretrained_save_pretrained(self):
             scheduler_config = self.get_scheduler_config()
             scheduler = scheduler_class(**scheduler_config)
 
+            timestep = 1
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                timestep = float(timestep)
+
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
                 new_scheduler = scheduler_class.from_config(tmpdirname)
@@ -151,10 +174,13 @@ def test_from_pretrained_save_pretrained(self):
             elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
                 kwargs["num_inference_steps"] = num_inference_steps
 
-            torch.manual_seed(0)
-            output = scheduler.step(residual, 1, sample, **kwargs).prev_sample
-            torch.manual_seed(0)
-            new_output = new_scheduler.step(residual, 1, sample, **kwargs).prev_sample
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.Generator().manual_seed(0)
+            output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample
+
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.Generator().manual_seed(0)
+            new_output = new_scheduler.step(residual, timestep, sample, **kwargs).prev_sample
 
             assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
 
@@ -163,7 +189,14 @@ def test_step_shape(self):
 
         num_inference_steps = kwargs.pop("num_inference_steps", None)
 
+        timestep_0 = 0
+        timestep_1 = 1
+
         for scheduler_class in self.scheduler_classes:
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                timestep_0 = float(timestep_0)
+                timestep_1 = float(timestep_1)
+
             scheduler_config = self.get_scheduler_config()
             scheduler = scheduler_class(**scheduler_config)
 
@@ -175,8 +208,8 @@ def test_step_shape(self):
             elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
                 kwargs["num_inference_steps"] = num_inference_steps
 
-            output_0 = scheduler.step(residual, 0, sample, **kwargs).prev_sample
-            output_1 = scheduler.step(residual, 1, sample, **kwargs).prev_sample
+            output_0 = scheduler.step(residual, timestep_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, timestep_1, sample, **kwargs).prev_sample
 
             self.assertEqual(output_0.shape, sample.shape)
             self.assertEqual(output_0.shape, output_1.shape)
@@ -216,6 +249,9 @@ def recursive_check(tuple_object, dict_object):
             timestep = 1
 
         for scheduler_class in self.scheduler_classes:
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                timestep = float(timestep)
+
             scheduler_config = self.get_scheduler_config()
             scheduler = scheduler_class(**scheduler_config)
 
@@ -227,6 +263,9 @@ def recursive_check(tuple_object, dict_object):
             elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
                 kwargs["num_inference_steps"] = num_inference_steps
 
+            # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.Generator().manual_seed(0)
             outputs_dict = scheduler.step(residual, timestep, sample, **kwargs)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
@@ -234,6 +273,9 @@ def recursive_check(tuple_object, dict_object):
             elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
                 kwargs["num_inference_steps"] = num_inference_steps
 
+            # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler
+            if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+                kwargs["generator"] = torch.Generator().manual_seed(0)
             outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs)
 
             recursive_check(outputs_tuple, outputs_dict)
@@ -933,6 +975,117 @@ def test_full_loop_no_noise(self):
         assert abs(result_mean.item() - 1.31) < 1e-3
 
 
+class EulerDiscreteSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (EulerDiscreteScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "trained_betas": None,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        generator = torch.Generator().manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+        print(result_sum, result_mean)
+
+        assert abs(result_sum.item() - 10.0807) < 1e-2
+        assert abs(result_mean.item() - 0.0131) < 1e-3
+
+
+class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (EulerAncestralDiscreteScheduler,)
+    num_inference_steps = 10
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1100,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "trained_betas": None,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def test_timesteps(self):
+        for timesteps in [10, 50, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_betas(self):
+        for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]):
+            self.check_over_configs(beta_start=beta_start, beta_end=beta_end)
+
+    def test_schedules(self):
+        for schedule in ["linear", "scaled_linear"]:
+            self.check_over_configs(beta_schedule=schedule)
+
+    def test_full_loop_no_noise(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps)
+
+        generator = torch.Generator().manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+        print(result_sum, result_mean)
+        assert abs(result_sum.item() - 152.3192) < 1e-2
+        assert abs(result_mean.item() - 0.1983) < 1e-3
+
+
 class IPNDMSchedulerTest(SchedulerCommonTest):
     scheduler_classes = (IPNDMScheduler,)
     forward_default_kwargs = (("num_inference_steps", 50),)

From c18941b01ad0ea6b07d020f353d81153c632a374 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 31 Oct 2022 17:26:30 +0100
Subject: [PATCH 093/269] [Better scheduler docs] Improve usage examples of
 schedulers (#890)

* [Better scheduler docs] Improve usage examples of schedulers

* finish

* fix warnings and add test

* finish

* more replacements

* adapt fast tests hf token

* correct more

* Apply suggestions from code review

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* Integrate compatibility with euler

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 .github/workflows/pr_tests.yml                |   4 +
 README.md                                     |   6 +-
 docs/source/quicktour.mdx                     |   2 +-
 examples/dreambooth/train_dreambooth.py       |   4 +-
 examples/text_to_image/train_text_to_image.py |  10 +-
 .../textual_inversion/textual_inversion.py    |  12 +-
 src/diffusers/configuration_utils.py          |  48 ++++-
 src/diffusers/pipeline_flax_utils.py          |   2 +-
 src/diffusers/pipeline_utils.py               |   4 +-
 .../pipelines/stable_diffusion/README.md      |   8 +-
 .../pipeline_onnx_stable_diffusion.py         |  29 +++
 .../pipeline_onnx_stable_diffusion_img2img.py |  13 ++
 .../pipeline_onnx_stable_diffusion_inpaint.py |  13 ++
 .../pipeline_stable_diffusion.py              |  13 ++
 .../pipeline_stable_diffusion_img2img.py      |  13 ++
 .../pipeline_stable_diffusion_inpaint.py      |  13 ++
 ...ipeline_stable_diffusion_inpaint_legacy.py |  13 ++
 src/diffusers/schedulers/scheduling_ddim.py   |   8 +
 src/diffusers/schedulers/scheduling_ddpm.py   |   8 +
 .../scheduling_euler_ancestral_discrete.py    |   8 +
 .../schedulers/scheduling_euler_discrete.py   |   8 +
 .../schedulers/scheduling_lms_discrete.py     |   8 +
 src/diffusers/schedulers/scheduling_pndm.py   |   8 +
 .../stable_diffusion/test_stable_diffusion.py |   8 +-
 .../test_stable_diffusion_img2img.py          |   3 +-
 .../test_stable_diffusion_inpaint.py          |   2 +-
 .../test_stable_diffusion_inpaint_legacy.py   |   3 +-
 tests/test_config.py                          | 184 ++++++++++++++++++
 28 files changed, 402 insertions(+), 53 deletions(-)

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index 75ab6c7c5f2a..163e24136f51 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -42,6 +42,8 @@ jobs:
         python utils/print_env.py
 
     - name: Run all fast tests on CPU
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
         python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_cpu tests/
 
@@ -91,6 +93,8 @@ jobs:
 
     - name: Run all fast tests on MPS
       shell: arch -arch arm64 bash {0}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
         ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/
 
diff --git a/README.md b/README.md
index 2302137c1e72..03365a2a5006 100644
--- a/README.md
+++ b/README.md
@@ -142,11 +142,7 @@ it before the pipeline and pass it to `from_pretrained`.
 ```python
 from diffusers import LMSDiscreteScheduler
 
-lms = LMSDiscreteScheduler(
-    beta_start=0.00085, 
-    beta_end=0.012, 
-    beta_schedule="scaled_linear"
-)
+lms = LMSDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
 
 pipe = StableDiffusionPipeline.from_pretrained(
     "runwayml/stable-diffusion-v1-5", 
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
index a652695f00ba..b5effdb847b0 100644
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -121,7 +121,7 @@ you could use it as follows:
 ```python
 >>> from diffusers import LMSDiscreteScheduler
 
->>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+>>> scheduler = LMSDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
 
 >>> generator = StableDiffusionPipeline.from_pretrained(
 ...     "runwayml/stable-diffusion-v1-5", scheduler=scheduler, use_auth_token=AUTH_TOKEN
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 8b0b20e07bb5..c37197fdc64c 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -469,9 +469,7 @@ def main(args):
         eps=args.adam_epsilon,
     )
 
-    noise_scheduler = DDPMScheduler(
-        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
-    )
+    noise_scheduler = DDPMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
 
     train_dataset = DreamBoothDataset(
         instance_data_root=args.instance_data_dir,
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 859a891ecf37..cf7dac89330c 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -372,11 +372,7 @@ def main():
         weight_decay=args.adam_weight_decay,
         eps=args.adam_epsilon,
     )
-
-    # TODO (patil-suraj): load scheduler using args
-    noise_scheduler = DDPMScheduler(
-        beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000
-    )
+    noise_scheduler = DDPMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
 
     # Get the datasets: you can either provide your own training and evaluation files (see below)
     # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
@@ -609,9 +605,7 @@ def collate_fn(examples):
             vae=vae,
             unet=unet,
             tokenizer=tokenizer,
-            scheduler=PNDMScheduler(
-                beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
-            ),
+            scheduler=PNDMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
             safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
             feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
         )
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index 18469af3d4ae..fc9380edcd73 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -419,13 +419,7 @@ def main():
         eps=args.adam_epsilon,
     )
 
-    # TODO (patil-suraj): load scheduler using args
-    noise_scheduler = DDPMScheduler(
-        beta_start=0.00085,
-        beta_end=0.012,
-        beta_schedule="scaled_linear",
-        num_train_timesteps=1000,
-    )
+    noise_scheduler = DDPMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
 
     train_dataset = TextualInversionDataset(
         data_root=args.train_data_dir,
@@ -558,9 +552,7 @@ def main():
             vae=vae,
             unet=unet,
             tokenizer=tokenizer,
-            scheduler=PNDMScheduler(
-                beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
-            ),
+            scheduler=PNDMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
             safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
             feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
         )
diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index 6df103823086..df18458c571d 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -16,6 +16,7 @@
 """ ConfigMixin base class and utilities."""
 import dataclasses
 import functools
+import importlib
 import inspect
 import json
 import os
@@ -48,9 +49,13 @@ class ConfigMixin:
           [`~ConfigMixin.save_config`] (should be overridden by parent class).
         - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
           overridden by parent class).
+        - **_compatible_classes** (`List[str]`) -- A list of classes that are compatible with the parent class, so that
+          `from_config` can be used from a class different than the one used to save the config (should be overridden
+          by parent class).
     """
     config_name = None
     ignore_for_config = []
+    _compatible_classes = []
 
     def register_to_config(self, **kwargs):
         if self.config_name is None:
@@ -280,9 +285,14 @@ def get_config_dict(
 
         return config_dict
 
+    @staticmethod
+    def _get_init_keys(cls):
+        return set(dict(inspect.signature(cls.__init__).parameters).keys())
+
     @classmethod
     def extract_init_dict(cls, config_dict, **kwargs):
-        expected_keys = set(dict(inspect.signature(cls.__init__).parameters).keys())
+        # 1. Retrieve expected config attributes from __init__ signature
+        expected_keys = cls._get_init_keys(cls)
         expected_keys.remove("self")
         # remove general kwargs if present in dict
         if "kwargs" in expected_keys:
@@ -292,9 +302,36 @@ def extract_init_dict(cls, config_dict, **kwargs):
             for arg in cls._flax_internal_args:
                 expected_keys.remove(arg)
 
+        # 2. Remove attributes that cannot be expected from expected config attributes
         # remove keys to be ignored
         if len(cls.ignore_for_config) > 0:
             expected_keys = expected_keys - set(cls.ignore_for_config)
+
+        # load diffusers library to import compatible and original scheduler
+        diffusers_library = importlib.import_module(__name__.split(".")[0])
+
+        # remove attributes from compatible classes that orig cannot expect
+        compatible_classes = [getattr(diffusers_library, c, None) for c in cls._compatible_classes]
+        # filter out None potentially undefined dummy classes
+        compatible_classes = [c for c in compatible_classes if c is not None]
+        expected_keys_comp_cls = set()
+        for c in compatible_classes:
+            expected_keys_c = cls._get_init_keys(c)
+            expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c)
+        expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls)
+        config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls}
+
+        # remove attributes from orig class that cannot be expected
+        orig_cls_name = config_dict.pop("_class_name", cls.__name__)
+        if orig_cls_name != cls.__name__:
+            orig_cls = getattr(diffusers_library, orig_cls_name)
+            unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys
+            config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}
+
+        # remove private attributes
+        config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
+
+        # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments
         init_dict = {}
         for key in expected_keys:
             if key in kwargs:
@@ -304,8 +341,7 @@ def extract_init_dict(cls, config_dict, **kwargs):
                 # use value from config dict
                 init_dict[key] = config_dict.pop(key)
 
-        config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")}
-
+        # 4. Give nice warning if unexpected values have been passed
         if len(config_dict) > 0:
             logger.warning(
                 f"The config attributes {config_dict} were passed to {cls.__name__}, "
@@ -313,14 +349,16 @@ def extract_init_dict(cls, config_dict, **kwargs):
                 f"{cls.config_name} configuration file."
             )
 
-        unused_kwargs = {**config_dict, **kwargs}
-
+        # 5. Give nice info if config attributes are initiliazed to default because they have not been passed
         passed_keys = set(init_dict.keys())
         if len(expected_keys - passed_keys) > 0:
             logger.info(
                 f"{expected_keys - passed_keys} was not found in config. Values will be initialized to default values."
             )
 
+        # 6. Define unused keyword arguments
+        unused_kwargs = {**config_dict, **kwargs}
+
         return init_dict, unused_kwargs
 
     @classmethod
diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
index f0330531e17f..80555f278505 100644
--- a/src/diffusers/pipeline_flax_utils.py
+++ b/src/diffusers/pipeline_flax_utils.py
@@ -272,7 +272,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         >>> # Download pipeline, but overwrite scheduler
         >>> from diffusers import LMSDiscreteScheduler
 
-        >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        >>> scheduler = LMSDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
         >>> pipeline = FlaxDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=scheduler)
         ```
         """
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 92623f9f9245..5c94df25cca0 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -360,7 +360,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         >>> # Download pipeline, but overwrite scheduler
         >>> from diffusers import LMSDiscreteScheduler
 
-        >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        >>> scheduler = LMSDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
         >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=scheduler)
         ```
         """
@@ -602,7 +602,7 @@ def components(self) -> Dict[str, Any]:
         ...     StableDiffusionInpaintPipeline,
         ... )
 
-        >>> img2text = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        >>> img2text = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
         >>> img2img = StableDiffusionImg2ImgPipeline(**img2text.components)
         >>> inpaint = StableDiffusionInpaintPipeline(**img2text.components)
         ```
diff --git a/src/diffusers/pipelines/stable_diffusion/README.md b/src/diffusers/pipelines/stable_diffusion/README.md
index 9c92e7b1a354..eaa441329a5c 100644
--- a/src/diffusers/pipelines/stable_diffusion/README.md
+++ b/src/diffusers/pipelines/stable_diffusion/README.md
@@ -72,7 +72,7 @@ image.save("astronaut_rides_horse.png")
 # make sure you're logged in with `huggingface-cli login`
 from diffusers import StableDiffusionPipeline, DDIMScheduler
 
-scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
+scheduler =  DDIMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
 
 pipe = StableDiffusionPipeline.from_pretrained(
     "runwayml/stable-diffusion-v1-5", 
@@ -91,11 +91,7 @@ image.save("astronaut_rides_horse.png")
 # make sure you're logged in with `huggingface-cli login`
 from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
 
-lms = LMSDiscreteScheduler(
-    beta_start=0.00085, 
-    beta_end=0.012, 
-    beta_schedule="scaled_linear"
-)
+lms = LMSDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
 
 pipe = StableDiffusionPipeline.from_pretrained(
     "runwayml/stable-diffusion-v1-5", 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index a2cb99be421f..22f5bf6c432d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -5,6 +5,7 @@
 
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
+from ...configuration_utils import FrozenDict
 from ...onnx_utils import OnnxRuntimeModel
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -36,6 +37,34 @@ def __init__(
         feature_extractor: CLIPFeatureExtractor,
     ):
         super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index b2105458b574..5e6b2e6f2fc3 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -90,6 +90,19 @@ def __init__(
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
         if safety_checker is None:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 37374aa4b1bc..2ce9831a1676 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -104,6 +104,19 @@ def __init__(
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
         if safety_checker is None:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index e64f50b38993..5927f36b12a1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -80,6 +80,19 @@ def __init__(
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
         if safety_checker is None:
             logger.warn(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 799b1c19f6c1..00c364f8e5e3 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -91,6 +91,19 @@ def __init__(
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
         if safety_checker is None:
             logger.warn(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 77bd55f2744d..39c8d16823af 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -90,6 +90,19 @@ def __init__(
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
         if safety_checker is None:
             logger.warn(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 55472f34cf68..6db3624177c6 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -96,6 +96,19 @@ def __init__(
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
         if safety_checker is None:
             logger.warn(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 58e40b57e923..f95c18d9fae2 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -109,6 +109,14 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
+    _compatible_classes = [
+        "PNDMScheduler",
+        "DDPMScheduler",
+        "LMSDiscreteScheduler",
+        "EulerDiscreteScheduler",
+        "EulerAncestralDiscreteScheduler",
+    ]
+
     @register_to_config
     def __init__(
         self,
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index d51d58ac8f45..114a86b4320e 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -102,6 +102,14 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
+    _compatible_classes = [
+        "DDIMScheduler",
+        "PNDMScheduler",
+        "LMSDiscreteScheduler",
+        "EulerDiscreteScheduler",
+        "EulerAncestralDiscreteScheduler",
+    ]
+
     @register_to_config
     def __init__(
         self,
diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
index b153dfbe2f02..3fe52a498044 100644
--- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -67,6 +67,14 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
+    _compatible_classes = [
+        "DDIMScheduler",
+        "DDPMScheduler",
+        "LMSDiscreteScheduler",
+        "PNDMScheduler",
+        "EulerDiscreteScheduler",
+    ]
+
     @register_to_config
     def __init__(
         self,
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index a71fbda13df6..93aeb8cc3865 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -68,6 +68,14 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
+    _compatible_classes = [
+        "DDIMScheduler",
+        "DDPMScheduler",
+        "LMSDiscreteScheduler",
+        "PNDMScheduler",
+        "EulerAncestralDiscreteScheduler",
+    ]
+
     @register_to_config
     def __init__(
         self,
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index 6157f4b4dc65..43e577b40936 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -67,6 +67,14 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
+    _compatible_classes = [
+        "DDIMScheduler",
+        "DDPMScheduler",
+        "PNDMScheduler",
+        "EulerDiscreteScheduler",
+        "EulerAncestralDiscreteScheduler",
+    ]
+
     @register_to_config
     def __init__(
         self,
diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py
index 03ea5c15d1b0..0082ede787b8 100644
--- a/src/diffusers/schedulers/scheduling_pndm.py
+++ b/src/diffusers/schedulers/scheduling_pndm.py
@@ -88,6 +88,14 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
+    _compatible_classes = [
+        "DDIMScheduler",
+        "DDPMScheduler",
+        "LMSDiscreteScheduler",
+        "EulerDiscreteScheduler",
+        "EulerAncestralDiscreteScheduler",
+    ]
+
     @register_to_config
     def __init__(
         self,
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index c21d40f4c5da..4e627846f2ec 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -644,13 +644,7 @@ def test_stable_diffusion_fast_ddim(self):
         sd_pipe = sd_pipe.to(torch_device)
         sd_pipe.set_progress_bar_config(disable=None)
 
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
+        scheduler = DDIMScheduler.from_config("CompVis/stable-diffusion-v1-1", subfolder="scheduler")
         sd_pipe.scheduler = scheduler
 
         prompt = "A painting of a squirrel eating a burger"
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 7cbb9bc51e81..78d001e3c758 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -523,9 +523,8 @@ def test_stable_diffusion_img2img_pipeline_k_lms(self):
         init_image = init_image.resize((768, 512))
         expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
-        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
-
         model_id = "CompVis/stable-diffusion-v1-4"
+        lms = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
             model_id,
             scheduler=lms,
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index e63738ae8a22..0a373ada68bc 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -366,8 +366,8 @@ def test_stable_diffusion_inpaint_pipeline_pndm(self):
         )
         expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
-        pndm = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True)
         model_id = "runwayml/stable-diffusion-inpainting"
+        pndm = PNDMScheduler.from_config(model_id, subfolder="scheduler")
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             model_id, safety_checker=None, scheduler=pndm, device_map="auto"
         )
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index 90dbf3b63604..d25342a35aea 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -407,9 +407,8 @@ def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
         )
         expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
-        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
-
         model_id = "CompVis/stable-diffusion-v1-4"
+        lms = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             model_id,
             scheduler=lms,
diff --git a/tests/test_config.py b/tests/test_config.py
index ab5b1e86bca8..1c6178d7208a 100755
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -13,10 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
+import os
 import tempfile
 import unittest
 
+import diffusers
+from diffusers import DDIMScheduler, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, PNDMScheduler, logging
 from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils.testing_utils import CaptureLogger
 
 
 class SampleObject(ConfigMixin):
@@ -34,6 +39,37 @@ def __init__(
         pass
 
 
+class SampleObject2(ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        f=[1, 3],
+    ):
+        pass
+
+
+class SampleObject3(ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        e=[1, 3],
+        f=[1, 3],
+    ):
+        pass
+
+
 class ConfigTester(unittest.TestCase):
     def test_load_not_from_mixin(self):
         with self.assertRaises(ValueError):
@@ -97,3 +133,151 @@ def test_save_load(self):
         assert config.pop("c") == (2, 5)  # instantiated as tuple
         assert new_config.pop("c") == [2, 5]  # saved & loaded as list because of json
         assert config == new_config
+
+    def test_save_load_from_different_config(self):
+        obj = SampleObject()
+
+        # mock add obj class to `diffusers`
+        setattr(diffusers, "SampleObject", SampleObject)
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            obj.save_config(tmpdirname)
+            with CaptureLogger(logger) as cap_logger_1:
+                new_obj_1 = SampleObject2.from_config(tmpdirname)
+
+            # now save a config parameter that is not expected
+            with open(os.path.join(tmpdirname, SampleObject.config_name), "r") as f:
+                data = json.load(f)
+                data["unexpected"] = True
+
+            with open(os.path.join(tmpdirname, SampleObject.config_name), "w") as f:
+                json.dump(data, f)
+
+            with CaptureLogger(logger) as cap_logger_2:
+                new_obj_2 = SampleObject.from_config(tmpdirname)
+
+            with CaptureLogger(logger) as cap_logger_3:
+                new_obj_3 = SampleObject2.from_config(tmpdirname)
+
+        assert new_obj_1.__class__ == SampleObject2
+        assert new_obj_2.__class__ == SampleObject
+        assert new_obj_3.__class__ == SampleObject2
+
+        assert cap_logger_1.out == ""
+        assert (
+            cap_logger_2.out
+            == "The config attributes {'unexpected': True} were passed to SampleObject, but are not expected and will"
+            " be ignored. Please verify your config.json configuration file.\n"
+        )
+        assert cap_logger_2.out.replace("SampleObject", "SampleObject2") == cap_logger_3.out
+
+    def test_save_load_compatible_schedulers(self):
+        SampleObject2._compatible_classes = ["SampleObject"]
+        SampleObject._compatible_classes = ["SampleObject2"]
+
+        obj = SampleObject()
+
+        # mock add obj class to `diffusers`
+        setattr(diffusers, "SampleObject", SampleObject)
+        setattr(diffusers, "SampleObject2", SampleObject2)
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            obj.save_config(tmpdirname)
+
+            # now save a config parameter that is expected by another class, but not origin class
+            with open(os.path.join(tmpdirname, SampleObject.config_name), "r") as f:
+                data = json.load(f)
+                data["f"] = [0, 0]
+                data["unexpected"] = True
+
+            with open(os.path.join(tmpdirname, SampleObject.config_name), "w") as f:
+                json.dump(data, f)
+
+            with CaptureLogger(logger) as cap_logger:
+                new_obj = SampleObject.from_config(tmpdirname)
+
+        assert new_obj.__class__ == SampleObject
+
+        assert (
+            cap_logger.out
+            == "The config attributes {'unexpected': True} were passed to SampleObject, but are not expected and will"
+            " be ignored. Please verify your config.json configuration file.\n"
+        )
+
+    def test_save_load_from_different_config_comp_schedulers(self):
+        SampleObject3._compatible_classes = ["SampleObject", "SampleObject2"]
+        SampleObject2._compatible_classes = ["SampleObject", "SampleObject3"]
+        SampleObject._compatible_classes = ["SampleObject2", "SampleObject3"]
+
+        obj = SampleObject()
+
+        # mock add obj class to `diffusers`
+        setattr(diffusers, "SampleObject", SampleObject)
+        setattr(diffusers, "SampleObject2", SampleObject2)
+        setattr(diffusers, "SampleObject3", SampleObject3)
+        logger = logging.get_logger("diffusers.configuration_utils")
+        logger.setLevel(diffusers.logging.INFO)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            obj.save_config(tmpdirname)
+
+            with CaptureLogger(logger) as cap_logger_1:
+                new_obj_1 = SampleObject.from_config(tmpdirname)
+
+            with CaptureLogger(logger) as cap_logger_2:
+                new_obj_2 = SampleObject2.from_config(tmpdirname)
+
+            with CaptureLogger(logger) as cap_logger_3:
+                new_obj_3 = SampleObject3.from_config(tmpdirname)
+
+        assert new_obj_1.__class__ == SampleObject
+        assert new_obj_2.__class__ == SampleObject2
+        assert new_obj_3.__class__ == SampleObject3
+
+        assert cap_logger_1.out == ""
+        assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n"
+        assert cap_logger_3.out == "{'f'} was not found in config. Values will be initialized to default values.\n"
+
+    def test_load_ddim_from_pndm(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with CaptureLogger(logger) as cap_logger:
+            ddim = DDIMScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+
+        assert ddim.__class__ == DDIMScheduler
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+    def test_load_ddim_from_euler(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with CaptureLogger(logger) as cap_logger:
+            euler = EulerDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+
+        assert euler.__class__ == EulerDiscreteScheduler
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+    def test_load_ddim_from_euler_ancestral(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with CaptureLogger(logger) as cap_logger:
+            euler = EulerAncestralDiscreteScheduler.from_config(
+                "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
+            )
+
+        assert euler.__class__ == EulerAncestralDiscreteScheduler
+        # no warning should be thrown
+        assert cap_logger.out == ""
+
+    def test_load_pndm(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with CaptureLogger(logger) as cap_logger:
+            pndm = PNDMScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+
+        assert pndm.__class__ == PNDMScheduler
+        # no warning should be thrown
+        assert cap_logger.out == ""

From 010bc4ea198eeaf379926316b214c22a1dab8d17 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 31 Oct 2022 16:35:59 +0000
Subject: [PATCH 094/269] incorrect model id

---
 docs/source/quicktour.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
index b5effdb847b0..463780a0727f 100644
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -121,7 +121,7 @@ you could use it as follows:
 ```python
 >>> from diffusers import LMSDiscreteScheduler
 
->>> scheduler = LMSDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+>>> scheduler = LMSDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
 
 >>> generator = StableDiffusionPipeline.from_pretrained(
 ...     "runwayml/stable-diffusion-v1-5", scheduler=scheduler, use_auth_token=AUTH_TOKEN

From 17c2c0600beebc128f4153b76264118eb7750cf8 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 31 Oct 2022 18:59:58 +0100
Subject: [PATCH 095/269] [Tests] Fix slow tests (#1087)

---
 src/diffusers/configuration_utils.py            |  2 +-
 .../pipeline_stable_diffusion_inpaint.py        | 17 +++++++++--------
 .../stable_diffusion/test_stable_diffusion.py   |  9 +++++----
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index df18458c571d..d830857a302c 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -323,7 +323,7 @@ def extract_init_dict(cls, config_dict, **kwargs):
 
         # remove attributes from orig class that cannot be expected
         orig_cls_name = config_dict.pop("_class_name", cls.__name__)
-        if orig_cls_name != cls.__name__:
+        if orig_cls_name != cls.__name__ and hasattr(diffusers_library, orig_cls_name):
             orig_cls = getattr(diffusers_library, orig_cls_name)
             unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys
             config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig}
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 39c8d16823af..57f9b65716ee 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -90,17 +90,18 @@ def __init__(
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+        if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
             deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
-                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
-                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
-                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
-                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration"
+                " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
+                " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
+                " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
+                " Hub, it would be very nice if you could open a Pull request for the"
+                " `scheduler/scheduler_config.json` file"
             )
-            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False)
             new_config = dict(scheduler.config)
-            new_config["clip_sample"] = False
+            new_config["skip_prk_steps"] = True
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 4e627846f2ec..260d58e94b04 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -640,13 +640,14 @@ def test_stable_diffusion(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_stable_diffusion_fast_ddim(self):
-        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1", device_map="auto")
+        scheduler = DDIMScheduler.from_config("CompVis/stable-diffusion-v1-1", subfolder="scheduler")
+
+        sd_pipe = StableDiffusionPipeline.from_pretrained(
+            "CompVis/stable-diffusion-v1-1", scheduler=scheduler, device_map="auto"
+        )
         sd_pipe = sd_pipe.to(torch_device)
         sd_pipe.set_progress_bar_config(disable=None)
 
-        scheduler = DDIMScheduler.from_config("CompVis/stable-diffusion-v1-1", subfolder="scheduler")
-        sd_pipe.scheduler = scheduler
-
         prompt = "A painting of a squirrel eating a burger"
         generator = torch.Generator(device=torch_device).manual_seed(0)
 

From 888468dd90b4e353225d7d505fa28d2963e65678 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 31 Oct 2022 19:01:42 +0100
Subject: [PATCH 096/269] Remove nn sequential (#1086)

* Remove nn sequential

* up
---
 src/diffusers/models/attention.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index bf04c3e6a3ca..af441ef86181 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -244,7 +244,9 @@ def __init__(
         self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
         self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
 
-        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
+        self.to_out = nn.ModuleList([])
+        self.to_out.append(nn.Linear(inner_dim, query_dim))
+        self.to_out.append(nn.Dropout(dropout))
 
     def reshape_heads_to_batch_dim(self, tensor):
         batch_size, seq_len, dim = tensor.shape
@@ -283,7 +285,11 @@ def forward(self, hidden_states, context=None, mask=None):
         else:
             hidden_states = self._sliced_attention(query, key, value, sequence_length, dim)
 
-        return self.to_out(hidden_states)
+        # linear proj
+        hidden_states = self.to_out[0](hidden_states)
+        # dropout
+        hidden_states = self.to_out[1](hidden_states)
+        return hidden_states
 
     def _attention(self, query, key, value):
         # TODO: use baddbmm for better performance
@@ -354,12 +360,19 @@ def __init__(
         super().__init__()
         inner_dim = int(dim * mult)
         dim_out = dim_out if dim_out is not None else dim
-        project_in = GEGLU(dim, inner_dim)
+        self.net = nn.ModuleList([])
 
-        self.net = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out))
+        # project in
+        self.net.append(GEGLU(dim, inner_dim))
+        # project dropout
+        self.net.append(nn.Dropout(dropout))
+        # project out
+        self.net.append(nn.Linear(inner_dim, dim_out))
 
     def forward(self, hidden_states):
-        return self.net(hidden_states)
+        for module in self.net:
+            hidden_states = module(hidden_states)
+        return hidden_states
 
 
 # feedforward

From 7fb4b882b9618005035fdd70d78b312bbcd2a5ef Mon Sep 17 00:00:00 2001
From: Laurent Mazare <laurent.mazare@gmail.com>
Date: Mon, 31 Oct 2022 18:15:15 +0000
Subject: [PATCH 097/269] Remove some unused parameter in CrossAttnUpBlock2D
 (#1034)

Remove some unused parameter

The `downsample_padding` parameter does not seem to be used in `CrossAttnUpBlock2D` (or by any up block for that matter) so removing it.
---
 src/diffusers/models/unet_2d_blocks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index f433eb5db06e..f4081c5c1cac 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -1053,7 +1053,6 @@ def __init__(
         cross_attention_dim=1280,
         attention_type="default",
         output_scale_factor=1.0,
-        downsample_padding=1,
         add_upsample=True,
     ):
         super().__init__()

From a793b1fe7e05adaa74f35193a6c3119265c6d889 Mon Sep 17 00:00:00 2001
From: MarkRich <mark.rich388@gmail.com>
Date: Tue, 1 Nov 2022 03:17:51 -0700
Subject: [PATCH 098/269] Add imagic to community pipelines (#958)

* initial commit to add imagic to stable diffusion community pipelines

* remove some testing changes

* comments from PR review for imagic stable diffusion

* remove changes from pipeline_stable_diffusion as part of imagic pipeline

* clean up example code and add line back in to pipeline_stable_diffusion for imagic pipeline

* remove unused functions

* small code quality changes for imagic pipeline

* clean up readme

* remove hardcoded logging values for imagic community example

* undo change for DDIMScheduler
---
 examples/community/README.md                  |  46 +-
 examples/community/imagic_stable_diffusion.py | 476 ++++++++++++++++++
 2 files changed, 521 insertions(+), 1 deletion(-)
 create mode 100644 examples/community/imagic_stable_diffusion.py

diff --git a/examples/community/README.md b/examples/community/README.md
index 2ef84291fc7d..bb3964e1a7c4 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -18,6 +18,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Composable Stable Diffusion| Stable Diffusion Pipeline that supports prompts that contain "&#124;" in prompts (as an AND condition) and weights (separated by "&#124;" as well) to positively / negatively weight prompts.                                                                                                                                                                                                                                                                                                     | [Composable Stable Diffusion](#composable-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 | Seed Resizing Stable Diffusion| Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation.                                                                                                                                                                                                                                                                                                     | [Seed Resizing](#seed-resizing)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 
+| Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image| [Imagic Stable Diffusion](#imagic-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 
 
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
@@ -373,6 +374,49 @@ for i in range(4):
 for i, img in enumerate(images):
     img.save(f"./composable_diffusion/image_{i}.png")
 ```
+
+### Imagic Stable Diffusion
+Allows you to edit an image using stable diffusion. 
+
+```python
+import requests
+from PIL import Image
+from io import BytesIO
+import torch
+from diffusers import DiffusionPipeline, DDIMScheduler
+has_cuda = torch.cuda.is_available()
+device = torch.device('cpu' if not has_cuda else 'cuda')
+pipe = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+        safety_checker=None,
+    use_auth_token=True,
+    custom_pipeline="imagic_stable_diffusion",
+    scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
+).to(device)
+generator = th.Generator("cuda").manual_seed(0)
+seed = 0
+prompt = "A photo of Barack Obama smiling with a big grin"
+url = 'https://www.dropbox.com/s/6tlwzr73jd1r9yk/obama.png?dl=1'
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+res = pipe.train(
+    prompt,
+    init_image,
+    guidance_scale=7.5,
+    num_inference_steps=50,
+    generator=generator)
+res = pipe(alpha=1)
+image = res.images[0]
+image.save('./imagic/imagic_image_alpha_1.png')
+res = pipe(alpha=1.5)
+image = res.images[0]
+image.save('./imagic/imagic_image_alpha_1_5.png')
+res = pipe(alpha=2)
+image = res.images[0]
+image.save('./imagic/imagic_image_alpha_2.png')
+```
+
 ### Seed Resizing 
 Test seed resizing. Originally generate an image in 512 by 512, then generate image with same seed at 512 by 592 using seed resizing. Finally, generate 512 by 592 using original stable diffusion pipeline.
 
@@ -456,4 +500,4 @@ res = pipe_compare(
 
 image = res.images[0]
 image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, h=height))
-```
\ No newline at end of file
+```
diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py
new file mode 100644
index 000000000000..92aa677b4626
--- /dev/null
+++ b/examples/community/imagic_stable_diffusion.py
@@ -0,0 +1,476 @@
+"""
+    modeled after the textual_inversion.py / train_dreambooth.py and the work
+    of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
+"""
+import inspect
+import warnings
+from typing import List, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+import PIL
+from accelerate import Accelerator
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import logging
+from tqdm.auto import tqdm
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+class ImagicStableDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for imagic image editing.
+    See paper here: https://arxiv.org/pdf/2210.09276.pdf
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offsensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def train(
+        self,
+        prompt: Union[str, List[str]],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        generator: Optional[torch.Generator] = None,
+        embedding_learning_rate: float = 0.001,
+        diffusion_model_learning_rate: float = 2e-6,
+        text_embedding_optimization_steps: int = 500,
+        model_fine_tuning_optimization_steps: int = 1000,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        accelerator = Accelerator(
+            gradient_accumulation_steps=1,
+            mixed_precision="fp16",
+        )
+
+        if "torch_device" in kwargs:
+            device = kwargs.pop("torch_device")
+            warnings.warn(
+                "`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0."
+                " Consider using `pipe.to(torch_device)` instead."
+            )
+
+            if device is None:
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+            self.to(device)
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # Freeze vae and unet
+        self.vae.requires_grad_(False)
+        self.unet.requires_grad_(False)
+        self.text_encoder.requires_grad_(False)
+        self.unet.eval()
+        self.vae.eval()
+        self.text_encoder.eval()
+
+        if accelerator.is_main_process:
+            accelerator.init_trackers(
+                "imagic",
+                config={
+                    "embedding_learning_rate": embedding_learning_rate,
+                    "text_embedding_optimization_steps": text_embedding_optimization_steps,
+                },
+            )
+
+        # get text embeddings for prompt
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncaton=True,
+            return_tensors="pt",
+        )
+        text_embeddings = torch.nn.Parameter(
+            self.text_encoder(text_input.input_ids.to(self.device))[0], requires_grad=True
+        )
+        text_embeddings = text_embeddings.detach()
+        text_embeddings.requires_grad_()
+        text_embeddings_orig = text_embeddings.clone()
+
+        # Initialize the optimizer
+        optimizer = torch.optim.Adam(
+            [text_embeddings],  # only optimize the embeddings
+            lr=embedding_learning_rate,
+        )
+
+        if isinstance(init_image, PIL.Image.Image):
+            init_image = preprocess(init_image)
+
+        latents_dtype = text_embeddings.dtype
+        init_image = init_image.to(device=self.device, dtype=latents_dtype)
+        init_latent_image_dist = self.vae.encode(init_image).latent_dist
+        init_image_latents = init_latent_image_dist.sample(generator=generator)
+        init_image_latents = 0.18215 * init_image_latents
+
+        progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process)
+        progress_bar.set_description("Steps")
+
+        global_step = 0
+
+        logger.info("First optimizing the text embedding to better reconstruct the init image")
+        for _ in range(text_embedding_optimization_steps):
+            with accelerator.accumulate(text_embeddings):
+                # Sample noise that we'll add to the latents
+                noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
+
+                # Predict the noise residual
+                noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
+
+                loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
+                accelerator.backward(loss)
+
+                optimizer.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+            logs = {"loss": loss.detach().item()}  # , "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+        accelerator.wait_for_everyone()
+
+        text_embeddings.requires_grad_(False)
+
+        # Now we fine tune the unet to better reconstruct the image
+        self.unet.requires_grad_(True)
+        self.unet.train()
+        optimizer = torch.optim.Adam(
+            self.unet.parameters(),  # only optimize unet
+            lr=diffusion_model_learning_rate,
+        )
+        progress_bar = tqdm(range(model_fine_tuning_optimization_steps), disable=not accelerator.is_local_main_process)
+
+        logger.info("Next fine tuning the entire model to better reconstruct the init image")
+        for _ in range(model_fine_tuning_optimization_steps):
+            with accelerator.accumulate(self.unet.parameters()):
+                # Sample noise that we'll add to the latents
+                noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
+
+                # Add noise to the latents according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
+
+                # Predict the noise residual
+                noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
+
+                loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
+                accelerator.backward(loss)
+
+                optimizer.step()
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+            logs = {"loss": loss.detach().item()}  # , "lr": lr_scheduler.get_last_lr()[0]}
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+
+        accelerator.wait_for_everyone()
+        self.text_embeddings_orig = text_embeddings_orig
+        self.text_embeddings = text_embeddings
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        alpha: float = 1.2,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        guidance_scale: float = 7.5,
+        eta: float = 0.0,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if self.text_embeddings is None:
+            raise ValueError("Please run the pipe.train() before trying to generate an image.")
+        if self.text_embeddings_orig is None:
+            raise ValueError("Please run the pipe.train() before trying to generate an image.")
+
+        text_embeddings = alpha * self.text_embeddings_orig + (1 - alpha) * self.text_embeddings
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens = [""]
+            max_length = self.tokenizer.model_max_length
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.view(1, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (1, self.unet.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if self.device.type == "mps":
+            # randn does not exist on mps
+            latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                self.device
+            )
+        else:
+            latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

From 98c42134a5615e1c26f2cca70ff9a4c142850f65 Mon Sep 17 00:00:00 2001
From: MatthieuTPHR <75613333+MatthieuTPHR@users.noreply.github.com>
Date: Wed, 2 Nov 2022 10:29:06 +0100
Subject: [PATCH 099/269] Up to 2x speedup on GPUs using memory efficient
 attention (#532)

* 2x speedup using memory efficient attention

* remove einops dependency

* Swap K, M in op instantiation

* Simplify code, remove unnecessary maybe_init call and function, remove unused self.scale parameter

* make xformers a soft dependency

* remove one-liner functions

* change one letter variable to appropriate names

* Remove Env variable dependency, remove MemoryEfficientCrossAttention class and use enable_xformers_memory_efficient_attention method

* Add memory efficient attention toggle to img2img and inpaint pipelines

* Clearer management of xformers' availability

* update optimizations markdown to add info about memory efficient attention

* add benchmarks for TITAN RTX

* More detailed explanation of how the mem eff benchmark were ran

* Removing autocast from optimization markdown

* import_utils: import torch only if is available

Co-authored-by: Nouamane Tazi <nouamane98@gmail.com>
---
 docs/source/optimization/fp16.mdx             | 39 +++++++++++++
 src/diffusers/models/attention.py             | 55 +++++++++++++++++--
 src/diffusers/models/unet_2d_blocks.py        | 12 ++++
 src/diffusers/models/unet_2d_condition.py     | 11 ++++
 .../pipeline_stable_diffusion.py              | 18 ++++++
 .../pipeline_stable_diffusion_img2img.py      | 18 ++++++
 .../pipeline_stable_diffusion_inpaint.py      | 18 ++++++
 src/diffusers/utils/import_utils.py           | 16 ++++++
 8 files changed, 183 insertions(+), 4 deletions(-)

diff --git a/docs/source/optimization/fp16.mdx b/docs/source/optimization/fp16.mdx
index f12c067ba5ee..4371daacc903 100644
--- a/docs/source/optimization/fp16.mdx
+++ b/docs/source/optimization/fp16.mdx
@@ -22,6 +22,7 @@ We present some techniques and ideas to optimize 🤗 Diffusers _inference_ for
 | fp16             | 3.61s   | x2.63   |
 | channels last    | 3.30s   | x2.88   |
 | traced UNet      | 3.21s   | x2.96   |
+| memory efficient attention  | 2.63s  | x3.61   |
 
 <em>
   obtained on NVIDIA TITAN RTX by generating a single image of size 512x512 from
@@ -290,3 +291,41 @@ pipe.unet = TracedUNet()
 with torch.inference_mode():
     image = pipe([prompt] * 1, num_inference_steps=50).images[0]
 ```
+
+
+## Memory Efficient Attention
+Recent work on optimizing the bandwitdh in the attention block have generated huge speed ups and gains in GPU memory usage. The most recent being Flash Attention (from @tridao, [code](https://github.com/HazyResearch/flash-attention), [paper](https://arxiv.org/pdf/2205.14135.pdf)) .
+Here are the speedups we obtain on a few Nvidia GPUs when running the inference at 512x512 with a batch size of 1 (one prompt):
+
+| GPU              	| Base Attention FP16 	| Memory Efficient Attention FP16 	|
+|------------------	|---------------------	|---------------------------------	|
+| NVIDIA Tesla T4  	| 3.5it/s             	| 5.5it/s                         	|
+| NVIDIA 3060 RTX  	| 4.6it/s             	| 7.8it/s                         	|
+| NVIDIA A10G      	| 8.88it/s            	| 15.6it/s                        	|
+| NVIDIA RTX A6000 	| 11.7it/s            	| 21.09it/s                       	|
+| NVIDIA TITAN RTX  | 12.51it/s         	| 18.22it/s                       	|
+| A100-SXM4-40GB    	| 18.6it/s            	| 29.it/s                        	|
+| A100-SXM-80GB    	| 18.7it/s            	| 29.5it/s                        	|
+
+To leverage it just make sure you have: 
+ - PyTorch > 1.12
+ - Cuda available
+ - Installed the [xformers](https://github.com/facebookresearch/xformers) library
+```python
+from diffusers import StableDiffusionPipeline
+import torch
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    revision="fp16",
+    torch_dtype=torch.float16,
+).to("cuda")
+
+pipe.enable_xformers_memory_efficient_attention()
+
+with torch.inference_mode():
+    sample = pipe("a small cat")
+
+# optional: You can disable it via
+# pipe.disable_xformers_memory_efficient_attention()
+```
\ No newline at end of file
diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index af441ef86181..1f9cf641c32d 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -18,6 +18,15 @@
 import torch.nn.functional as F
 from torch import nn
 
+from diffusers.utils.import_utils import is_xformers_available
+
+
+if is_xformers_available():
+    import xformers
+    import xformers.ops
+else:
+    xformers = None
+
 
 class AttentionBlock(nn.Module):
     """
@@ -150,6 +159,10 @@ def _set_attention_slice(self, slice_size):
         for block in self.transformer_blocks:
             block._set_attention_slice(slice_size)
 
+    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for block in self.transformer_blocks:
+            block._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
     def forward(self, hidden_states, context=None):
         # note: if no context is given, cross-attention defaults to self-attention
         batch, channel, height, weight = hidden_states.shape
@@ -206,6 +219,32 @@ def _set_attention_slice(self, slice_size):
         self.attn1._slice_size = slice_size
         self.attn2._slice_size = slice_size
 
+    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        if not is_xformers_available():
+            print("Here is how to install it")
+            raise ModuleNotFoundError(
+                "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
+                " xformers",
+                name="xformers",
+            )
+        elif not torch.cuda.is_available():
+            raise ValueError(
+                "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
+                " available for GPU "
+            )
+        else:
+            try:
+                # Make sure we can run the memory efficient attention
+                _ = xformers.ops.memory_efficient_attention(
+                    torch.randn((1, 2, 40), device="cuda"),
+                    torch.randn((1, 2, 40), device="cuda"),
+                    torch.randn((1, 2, 40), device="cuda"),
+                )
+            except Exception as e:
+                raise e
+            self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
+            self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
+
     def forward(self, hidden_states, context=None):
         hidden_states = self.attn1(self.norm1(hidden_states)) + hidden_states
         hidden_states = self.attn2(self.norm2(hidden_states), context=context) + hidden_states
@@ -239,6 +278,7 @@ def __init__(
         # is split across the batch axis to save memory
         # You can set slice_size with `set_attention_slice`
         self._slice_size = None
+        self._use_memory_efficient_attention_xformers = False
 
         self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
         self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
@@ -279,11 +319,13 @@ def forward(self, hidden_states, context=None, mask=None):
         # TODO(PVP) - mask is currently never used. Remember to re-implement when used
 
         # attention, what we cannot get enough of
-
-        if self._slice_size is None or query.shape[0] // self._slice_size == 1:
-            hidden_states = self._attention(query, key, value)
+        if self._use_memory_efficient_attention_xformers:
+            hidden_states = self._memory_efficient_attention_xformers(query, key, value)
         else:
-            hidden_states = self._sliced_attention(query, key, value, sequence_length, dim)
+            if self._slice_size is None or query.shape[0] // self._slice_size == 1:
+                hidden_states = self._attention(query, key, value)
+            else:
+                hidden_states = self._sliced_attention(query, key, value, sequence_length, dim)
 
         # linear proj
         hidden_states = self.to_out[0](hidden_states)
@@ -341,6 +383,11 @@ def _sliced_attention(self, query, key, value, sequence_length, dim):
         hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
         return hidden_states
 
+    def _memory_efficient_attention_xformers(self, query, key, value):
+        hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=None)
+        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
+        return hidden_states
+
 
 class FeedForward(nn.Module):
     r"""
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index f4081c5c1cac..ae4fe2d8bba7 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -367,6 +367,10 @@ def set_attention_slice(self, slice_size):
         for attn in self.attentions:
             attn._set_attention_slice(slice_size)
 
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for attn in self.attentions:
+            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
     def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
         hidden_states = self.resnets[0](hidden_states, temb)
         for attn, resnet in zip(self.attentions, self.resnets[1:]):
@@ -542,6 +546,10 @@ def set_attention_slice(self, slice_size):
         for attn in self.attentions:
             attn._set_attention_slice(slice_size)
 
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for attn in self.attentions:
+            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
     def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
         output_states = ()
 
@@ -1117,6 +1125,10 @@ def set_attention_slice(self, slice_size):
 
         self.gradient_checkpointing = False
 
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for attn in self.attentions:
+            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
     def forward(
         self,
         hidden_states,
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index d271b78a6525..7f7f3ecd4435 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -225,6 +225,17 @@ def set_attention_slice(self, slice_size):
             if hasattr(block, "attentions") and block.attentions is not None:
                 block.set_attention_slice(slice_size)
 
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for block in self.down_blocks:
+            if hasattr(block, "attentions") and block.attentions is not None:
+                block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+        self.mid_block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+        for block in self.up_blocks:
+            if hasattr(block, "attentions") and block.attentions is not None:
+                block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
             module.gradient_checkpointing = value
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 5927f36b12a1..3c1eb734a49d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -113,6 +113,24 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 00c364f8e5e3..e61fb27acc1e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -151,6 +151,24 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `set_attention_slice`
         self.enable_attention_slicing(None)
 
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     @torch.no_grad()
     def __call__(
         self,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 57f9b65716ee..bbe6ee60832c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -151,6 +151,24 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     @torch.no_grad()
     def __call__(
         self,
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
index 2a5f7f64dd07..4ea02dcc94da 100644
--- a/src/diffusers/utils/import_utils.py
+++ b/src/diffusers/utils/import_utils.py
@@ -168,6 +168,18 @@
 except importlib_metadata.PackageNotFoundError:
     _accelerate_available = False
 
+_xformers_available = importlib.util.find_spec("xformers") is not None
+try:
+    _xformers_version = importlib_metadata.version("xformers")
+    if _torch_available:
+        import torch
+
+        if torch.__version__ < version.Version("1.12"):
+            raise ValueError("PyTorch should be >= 1.12")
+    logger.debug(f"Successfully imported xformers version {_xformers_version}")
+except importlib_metadata.PackageNotFoundError:
+    _xformers_available = False
+
 
 def is_torch_available():
     return _torch_available
@@ -205,6 +217,10 @@ def is_scipy_available():
     return _scipy_available
 
 
+def is_xformers_available():
+    return _xformers_available
+
+
 def is_accelerate_available():
     return _accelerate_available
 

From 86087957117649524c671563e3e051eadfdad5e8 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Wed, 2 Nov 2022 11:32:46 +0100
Subject: [PATCH 100/269] [docs] add euler scheduler in docs, how to use
 differnet schedulers  (#1089)

* add euler scheduler in docs

* add a section for how to use different scheds

* address patrck's comments
---
 docs/source/api/pipelines/stable_diffusion.mdx | 15 +++++++++++++++
 docs/source/api/schedulers.mdx                 | 16 ++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/docs/source/api/pipelines/stable_diffusion.mdx b/docs/source/api/pipelines/stable_diffusion.mdx
index 46ffc3412e36..26d6a210adad 100644
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -31,6 +31,21 @@ For more details about how Stable Diffusion works and how it differs from the ba
 
 ## Tips
 
+### How to load and use different schedulers.
+
+The stable diffusion pipeline uses [`PNDMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the stable diffusion pipeline such as [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
+To use a different scheduler, you can pass the `scheduler` argument to `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
+
+```python
+from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
+
+euler_scheduler = EulerDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+pipeline = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=euler_scheduler)
+```
+
+
+### How to conver all use cases with multiple or single pipeline
+
 If you want to use all possible use cases in a single `DiffusionPipeline` you can either:
 - Make use of the [Stable Diffusion Mega Pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community#stable-diffusion-mega) or 
 - Make use of the `components` functionality to instantiate all components in the most memory-efficient way:
diff --git a/docs/source/api/schedulers.mdx b/docs/source/api/schedulers.mdx
index 3f88e563de19..6616a3e51538 100644
--- a/docs/source/api/schedulers.mdx
+++ b/docs/source/api/schedulers.mdx
@@ -112,3 +112,19 @@ Score SDE-VP is under construction.
 </Tip>
 
 [[autodoc]] schedulers.scheduling_sde_vp.ScoreSdeVpScheduler
+
+
+#### Euler scheduler
+
+Euler scheduler (Algorithm 2) from the paper [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) by Karras et al. (2022). Based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by Katherine Crowson.
+Fast scheduler which often times generates good outputs with 20-30 steps.
+
+[[autodoc]] EulerDiscreteScheduler
+
+
+#### Euler Ancestral scheduler
+
+Ancestral sampling with Euler method steps. Based on the original (k-diffusion)[https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72] implementation by Katherine Crowson.
+Fast scheduler which often times generates good outputs with 20-30 steps.
+
+[[autodoc]] EulerAncestralDiscreteScheduler
\ No newline at end of file

From 8ee21915bfd5a81f1f6bb17dd8ac7c6ba5693ce4 Mon Sep 17 00:00:00 2001
From: Lewington-pitsos <lewingtonpitsos@gmail.com>
Date: Wed, 2 Nov 2022 21:47:26 +1100
Subject: [PATCH 101/269] Integration tests precision improvement for
 inpainting (#1052)

* improve test precision

get tests passing with greater precision using lewington images

* make old numpy load function a wrapper around a more flexible numpy loading function

* adhere to black formatting

* add more black formatting

* adhere to isort

* loosen precision and replace path

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 src/diffusers/utils/__init__.py               |  1 +
 src/diffusers/utils/testing_utils.py          | 31 +++++++++++++++----
 tests/models/test_models_unet_2d.py           | 14 +++++++--
 tests/models/test_models_vae.py               |  4 +--
 .../test_stable_diffusion_inpaint.py          | 27 +++++++---------
 .../test_stable_diffusion_inpaint_legacy.py   | 20 ++++++------
 6 files changed, 60 insertions(+), 37 deletions(-)

diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index 12d731128385..7395f4edfa26 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -42,6 +42,7 @@
 if is_torch_available():
     from .testing_utils import (
         floats_tensor,
+        load_hf_numpy,
         load_image,
         load_numpy,
         parse_flag_from_env,
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index bd3b08d54a1c..bf398e5b6fe5 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -139,6 +139,29 @@ def require_onnxruntime(test_case):
     return unittest.skipUnless(is_onnx_available(), "test requires onnxruntime")(test_case)
 
 
+def load_numpy(arry: Union[str, np.ndarray]) -> np.ndarray:
+    if isinstance(arry, str):
+        if arry.startswith("http://") or arry.startswith("https://"):
+            response = requests.get(arry)
+            response.raise_for_status()
+            arry = np.load(BytesIO(response.content))
+        elif os.path.isfile(arry):
+            arry = np.load(arry)
+        else:
+            raise ValueError(
+                f"Incorrect path or url, URLs must start with `http://` or `https://`, and {arry} is not a valid path"
+            )
+    elif isinstance(arry, np.ndarray):
+        pass
+    else:
+        raise ValueError(
+            "Incorrect format used for numpy ndarray. Should be an url linking to an image, a local path, or a"
+            " ndarray."
+        )
+
+    return arry
+
+
 def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
     """
     Args:
@@ -168,17 +191,13 @@ def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image:
     return image
 
 
-def load_numpy(path) -> np.ndarray:
+def load_hf_numpy(path) -> np.ndarray:
     if not path.startswith("http://") or path.startswith("https://"):
         path = os.path.join(
             "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main", urllib.parse.quote(path)
         )
 
-    response = requests.get(path)
-    response.raise_for_status()
-    array = np.load(BytesIO(response.content))
-
-    return array
+    return load_numpy(path)
 
 
 # --- pytest conf functions --- #
diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
index 548588918c88..20371708a4d8 100644
--- a/tests/models/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -21,7 +21,15 @@
 import torch
 
 from diffusers import UNet2DConditionModel, UNet2DModel
-from diffusers.utils import floats_tensor, load_numpy, logging, require_torch_gpu, slow, torch_all_close, torch_device
+from diffusers.utils import (
+    floats_tensor,
+    load_hf_numpy,
+    logging,
+    require_torch_gpu,
+    slow,
+    torch_all_close,
+    torch_device,
+)
 from parameterized import parameterized
 
 from ..test_modeling_common import ModelTesterMixin
@@ -423,7 +431,7 @@ def tearDown(self):
 
     def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False):
         dtype = torch.float16 if fp16 else torch.float32
-        image = torch.from_numpy(load_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
+        image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
         return image
 
     def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
@@ -439,7 +447,7 @@ def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
 
     def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False):
         dtype = torch.float16 if fp16 else torch.float32
-        hidden_states = torch.from_numpy(load_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
+        hidden_states = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
         return hidden_states
 
     @parameterized.expand(
diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
index f6333d6cd906..3da7b50e34f3 100644
--- a/tests/models/test_models_vae.py
+++ b/tests/models/test_models_vae.py
@@ -20,7 +20,7 @@
 
 from diffusers import AutoencoderKL
 from diffusers.modeling_utils import ModelMixin
-from diffusers.utils import floats_tensor, load_numpy, require_torch_gpu, slow, torch_all_close, torch_device
+from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device
 from parameterized import parameterized
 
 from ..test_modeling_common import ModelTesterMixin
@@ -147,7 +147,7 @@ def tearDown(self):
 
     def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False):
         dtype = torch.float16 if fp16 else torch.float32
-        image = torch.from_numpy(load_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
+        image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype)
         return image
 
     def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 0a373ada68bc..f5a8b3cf9ecc 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -28,7 +28,7 @@
     UNet2DModel,
     VQModel,
 )
-from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
 from PIL import Image
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
@@ -278,11 +278,10 @@ def test_stable_diffusion_inpaint_pipeline(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
         )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/yellow_cat_sitting_on_a_park_bench.png"
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint"
+            "/yellow_cat_sitting_on_a_park_bench.npy"
         )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         model_id = "runwayml/stable-diffusion-inpainting"
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
@@ -307,7 +306,7 @@ def test_stable_diffusion_inpaint_pipeline(self):
         image = output.images[0]
 
         assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        assert np.abs(expected_image - image).max() < 1e-3
 
     def test_stable_diffusion_inpaint_pipeline_fp16(self):
         init_image = load_image(
@@ -318,11 +317,10 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
         )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/yellow_cat_sitting_on_a_park_bench_fp16.png"
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint"
+            "/yellow_cat_sitting_on_a_park_bench_fp16.npy"
         )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         model_id = "runwayml/stable-diffusion-inpainting"
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
@@ -360,11 +358,10 @@ def test_stable_diffusion_inpaint_pipeline_pndm(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
         )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/yellow_cat_sitting_on_a_park_bench_pndm.png"
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint"
+            "/yellow_cat_sitting_on_a_park_bench_pndm.npy"
         )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         model_id = "runwayml/stable-diffusion-inpainting"
         pndm = PNDMScheduler.from_config(model_id, subfolder="scheduler")
@@ -388,4 +385,4 @@ def test_stable_diffusion_inpaint_pipeline_pndm(self):
         image = output.images[0]
 
         assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        assert np.abs(expected_image - image).max() < 1e-3
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index d25342a35aea..81deba67f274 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -31,7 +31,7 @@
     VQModel,
 )
 from diffusers.utils import floats_tensor, load_image, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import load_numpy, require_torch_gpu
 from PIL import Image
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
@@ -358,11 +358,10 @@ def test_stable_diffusion_inpaint_legacy_pipeline(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
         )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/red_cat_sitting_on_a_park_bench.png"
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint"
+            "/red_cat_sitting_on_a_park_bench.npy"
         )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         model_id = "CompVis/stable-diffusion-v1-4"
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
@@ -389,7 +388,7 @@ def test_stable_diffusion_inpaint_legacy_pipeline(self):
         image = output.images[0]
 
         assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        assert np.abs(expected_image - image).max() < 1e-3
 
     def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
         # TODO(Anton, Patrick) - I think we can remove this test soon
@@ -401,11 +400,10 @@ def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
         )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/red_cat_sitting_on_a_park_bench_k_lms.png"
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint"
+            "/red_cat_sitting_on_a_park_bench_k_lms.npy"
         )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         model_id = "CompVis/stable-diffusion-v1-4"
         lms = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
@@ -434,7 +432,7 @@ def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
         image = output.images[0]
 
         assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        assert np.abs(expected_image - image).max() < 1e-3
 
     def test_stable_diffusion_inpaint_legacy_intermediate_state(self):
         number_of_steps = 0

From bdbcaa9852565d3854100f6504266ffcc7405b05 Mon Sep 17 00:00:00 2001
From: rafael <rafael.irgolic@gmail.com>
Date: Wed, 2 Nov 2022 06:51:18 -0400
Subject: [PATCH 102/269] lpw_stable_diffusion: Add is_cancelled_callback
 (#1053)

* [Community Pipelines] lpw_stable_diffusion: Add is_cancelled_callback

* [Community pipelines] lpw_stable_diffusion_onnx: Add is_cancelled_callback
---
 examples/community/lpw_stable_diffusion.py      | 12 ++++++++++--
 examples/community/lpw_stable_diffusion_onnx.py | 12 ++++++++++--
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index 1b2279de720c..74aed2fec86f 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -498,6 +498,7 @@ def __call__(
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        is_cancelled_callback: Optional[Callable[[], bool]] = None,
         callback_steps: Optional[int] = 1,
         **kwargs,
     ):
@@ -560,11 +561,15 @@ def __call__(
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
                 called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            is_cancelled_callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. If the function returns
+                `True`, the inference will be cancelled.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
 
         Returns:
+            `None` if cancelled by `is_cancelled_callback`,
             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
             When returning a tuple, the first element is a list with the generated images, and the second element is a
@@ -757,8 +762,11 @@ def __call__(
                 latents = (init_latents_proper * mask) + (latents * (1 - mask))
 
             # call the callback, if provided
-            if callback is not None and i % callback_steps == 0:
-                callback(i, t, latents)
+            if i % callback_steps == 0:
+                if callback is not None:
+                    callback(i, t, latents)
+                if is_cancelled_callback is not None and is_cancelled_callback():
+                    return None
 
         latents = 1 / 0.18215 * latents
         image = self.vae.decode(latents).sample
diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py
index 37f03c86f29d..69b942f9ef1a 100644
--- a/examples/community/lpw_stable_diffusion_onnx.py
+++ b/examples/community/lpw_stable_diffusion_onnx.py
@@ -435,6 +435,7 @@ def __call__(
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        is_cancelled_callback: Optional[Callable[[], bool]] = None,
         callback_steps: Optional[int] = 1,
         **kwargs,
     ):
@@ -496,11 +497,15 @@ def __call__(
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference. The function will be
                 called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            is_cancelled_callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. If the function returns
+                `True`, the inference will be cancelled.
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
 
         Returns:
+            `None` if cancelled by `is_cancelled_callback`,
             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
             When returning a tuple, the first element is a list with the generated images, and the second element is a
@@ -668,8 +673,11 @@ def __call__(
                 latents = (init_latents_proper * mask) + (latents * (1 - mask))
 
             # call the callback, if provided
-            if callback is not None and i % callback_steps == 0:
-                callback(i, t, latents)
+            if i % callback_steps == 0:
+                if callback is not None:
+                    callback(i, t, latents)
+                if is_cancelled_callback is not None and is_cancelled_callback():
+                    return None
 
         latents = 1 / 0.18215 * latents
         # image = self.vae_decoder(latent_sample=latents)[0]

From d53ffbbdf448eddb35d0e73868053be943b1e17a Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 2 Nov 2022 11:59:00 +0100
Subject: [PATCH 103/269] Rename latent (#1102)

* Rename latent

* uP
---
 docs/source/api/pipelines/overview.mdx | 2 +-
 src/diffusers/pipelines/README.md      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index af711a02d9f3..9bb351d93c8d 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -28,7 +28,7 @@ or created independently from each other.
 
 To that end, we strive to offer all open-sourced, state-of-the-art diffusion system under a unified API. 
 More specifically, we strive to provide pipelines that
-- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LatentDiffusionPipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
+- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LDMTextToImagePipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
 - 2. have a simple user interface to run the model in inference (see the [Pipelines API](#pipelines-api) section), 
 - 3. are easy to understand with code that is self-explanatory and can be read along-side the official paper (see [Pipelines summary](#pipelines-summary)),
 - 4. can easily be contributed by the community (see the [Contribution](#contribution) section).
diff --git a/src/diffusers/pipelines/README.md b/src/diffusers/pipelines/README.md
index 86048eb5a052..2941660fa235 100644
--- a/src/diffusers/pipelines/README.md
+++ b/src/diffusers/pipelines/README.md
@@ -16,7 +16,7 @@ or created independently from each other.
 
 To that end, we strive to offer all open-sourced, state-of-the-art diffusion system under a unified API. 
 More specifically, we strive to provide pipelines that
-- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LatentDiffusionPipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
+- 1. can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LDMTextToImagePipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)),
 - 2. have a simple user interface to run the model in inference (see the [Pipelines API](#pipelines-api) section), 
 - 3. are easy to understand with code that is self-explanatory and can be read along-side the official paper (see [Pipelines summary](#pipelines-summary)),
 - 4. can easily be contributed by the community (see the [Contribution](#contribution) section).

From 0025626cd9feca260e05f295f272af4f7b9ce44a Mon Sep 17 00:00:00 2001
From: Jonathan Rahn <rahn.jonathan@gmail.com>
Date: Wed, 2 Nov 2022 13:15:30 +0100
Subject: [PATCH 104/269] fix typo in examples dreambooth README.md (#1073)

Update README.md

fixed typo
---
 examples/dreambooth/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index 5b091aece3ce..3c9d04abc2d2 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -185,7 +185,7 @@ accelerate launch train_dreambooth.py \
   --class_prompt="a photo of dog" \
   --resolution=512 \
   --train_batch_size=1 \
-  --use_8bit_adam
+  --use_8bit_adam \
   --gradient_checkpointing \
   --learning_rate=2e-6 \
   --lr_scheduler="constant" \
@@ -291,4 +291,4 @@ python train_dreambooth_flax.py \
   --learning_rate=2e-6 \
   --num_class_images=200 \
   --max_train_steps=800
-```
\ No newline at end of file
+```

From b1ec61ee45e14403f6f5e81a5097a50e8f5385a7 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Wed, 2 Nov 2022 14:02:52 +0100
Subject: [PATCH 105/269] fix model card url in text inversion readme. (#1103)

Update README.md
---
 examples/textual_inversion/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/textual_inversion/README.md b/examples/textual_inversion/README.md
index 9f3aa671456e..2edf34cb490d 100644
--- a/examples/textual_inversion/README.md
+++ b/examples/textual_inversion/README.md
@@ -29,7 +29,7 @@ accelerate config
 
 ### Cat toy example
 
-You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. 
+You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-5`, so you'll need to visit [its card](https://huggingface.co/runwayml/stable-diffusion-v1-5), read the license and tick the checkbox if you agree. 
 
 You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens).
 
@@ -111,4 +111,4 @@ python textual_inversion_flax.py \
   --learning_rate=5.0e-04 --scale_lr \
   --output_dir="textual_inversion_cat"
 ```
-It should be at least 70% faster than the PyTorch script with the same configuration.
\ No newline at end of file
+It should be at least 70% faster than the PyTorch script with the same configuration.

From 4e59bcc680ba4f68bc8d45249db5fe7a078413db Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Wed, 2 Nov 2022 14:07:07 +0100
Subject: [PATCH 106/269] [CI] Framework and hardware-specific CI tests (#997)

* [WIP][CI] Framework and hardware-specific docker images for CI tests

* username

* fix cpu

* try out the image

* push latest

* update workspace

* no root isolation for actions

* add a flax image

* flax and onnx matrix

* fix runners

* add reports

* onnxruntime image

* retry tpu

* fix

* fix

* build onnxruntime

* naming

* onnxruntime-gpu image

* onnxruntime-gpu image, slow tests

* latest jax version

* trigger flax

* run flax tests in one thread

* fast flax tests on cpu

* fast flax tests on cpu

* trigger slow tests

* rebuild torch cuda

* force cuda provider

* fix onnxruntime tests

* trigger slow

* don't specify gpu for tpu

* optimize

* memory limit

* fix flax tests

* disable docker cache
---
 .github/workflows/build_docker_images.yml     | 50 ++++++++++
 .github/workflows/pr_tests.yml                | 78 +++++++++++++---
 .github/workflows/push_tests.yml              | 91 ++++++++++++++-----
 docker/diffusers-flax-cpu/Dockerfile          | 42 +++++++++
 docker/diffusers-flax-tpu/Dockerfile          | 44 +++++++++
 docker/diffusers-onnxruntime-cpu/Dockerfile   | 42 +++++++++
 docker/diffusers-onnxruntime-cuda/Dockerfile  | 42 +++++++++
 docker/diffusers-pytorch-cpu/Dockerfile       | 41 +++++++++
 docker/diffusers-pytorch-cuda/Dockerfile      | 41 +++++++++
 setup.py                                      |  6 +-
 src/diffusers/dependency_versions_table.py    |  5 +-
 .../test_onnx_stable_diffusion.py             | 23 ++++-
 .../test_onnx_stable_diffusion_img2img.py     | 21 ++++-
 .../test_onnx_stable_diffusion_inpaint.py     | 22 ++++-
 tests/test_pipelines_flax.py                  |  6 +-
 tests/test_scheduler_flax.py                  | 53 ++++++++---
 16 files changed, 540 insertions(+), 67 deletions(-)
 create mode 100644 .github/workflows/build_docker_images.yml
 create mode 100644 docker/diffusers-flax-cpu/Dockerfile
 create mode 100644 docker/diffusers-flax-tpu/Dockerfile
 create mode 100644 docker/diffusers-onnxruntime-cpu/Dockerfile
 create mode 100644 docker/diffusers-onnxruntime-cuda/Dockerfile
 create mode 100644 docker/diffusers-pytorch-cpu/Dockerfile
 create mode 100644 docker/diffusers-pytorch-cuda/Dockerfile

diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml
new file mode 100644
index 000000000000..ff4bd66fdde5
--- /dev/null
+++ b/.github/workflows/build_docker_images.yml
@@ -0,0 +1,50 @@
+name: Build Docker images (nightly)
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 0 * * *" # every day at midnight
+
+concurrency:
+  group: docker-image-builds
+  cancel-in-progress: false
+
+env:
+  REGISTRY: diffusers
+
+jobs:
+  build-docker-images:
+    runs-on: ubuntu-latest
+
+    permissions:
+      contents: read
+      packages: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        image-name:
+          - diffusers-pytorch-cpu
+          - diffusers-pytorch-cuda
+          - diffusers-flax-cpu
+          - diffusers-flax-tpu
+          - diffusers-onnxruntime-cpu
+          - diffusers-onnxruntime-cuda
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ env.REGISTRY }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v3
+        with:
+          no-cache: true
+          context: ./docker/${{ matrix.image-name }}
+          push: true
+          tags: ${{ env.REGISTRY }}/${{ matrix.image-name }}:latest
diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index 163e24136f51..242e9552d934 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -11,19 +11,45 @@ concurrency:
 
 env:
   DIFFUSERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
+  OMP_NUM_THREADS: 4
+  MKL_NUM_THREADS: 4
   PYTEST_TIMEOUT: 60
   MPS_TORCH_VERSION: 1.13.0
 
 jobs:
-  run_tests_cpu:
-    name: CPU tests on Ubuntu
-    runs-on: [ self-hosted, docker-gpu ]
+  run_fast_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: Fast PyTorch CPU tests on Ubuntu
+            framework: pytorch
+            runner: docker-cpu
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_cpu
+          - name: Fast Flax CPU tests on Ubuntu
+            framework: flax
+            runner: docker-cpu
+            image: diffusers/diffusers-flax-cpu
+            report: flax_cpu
+          - name: Fast ONNXRuntime CPU tests on Ubuntu
+            framework: onnxruntime
+            runner: docker-cpu
+            image: diffusers/diffusers-onnxruntime-cpu
+            report: onnx_cpu
+
+    name: ${{ matrix.config.name }}
+
+    runs-on: ${{ matrix.config.runner }}
+
     container:
-      image: python:3.7
+      image: ${{ matrix.config.image }}
       options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
 
+    defaults:
+      run:
+        shell: bash
+
     steps:
     - name: Checkout diffusers
       uses: actions/checkout@v3
@@ -32,8 +58,6 @@ jobs:
 
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
-        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
         python -m pip install -e .[quality,test]
         python -m pip install git+https://github.com/huggingface/accelerate
 
@@ -41,25 +65,49 @@ jobs:
       run: |
         python utils/print_env.py
 
-    - name: Run all fast tests on CPU
+    - name: Run fast PyTorch CPU tests
+      if: ${{ matrix.config.framework == 'pytorch' }}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run fast Flax TPU tests
+      if: ${{ matrix.config.framework == 'flax' }}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Flax" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run fast ONNXRuntime CPU tests
+      if: ${{ matrix.config.framework == 'onnxruntime' }}
       env:
         HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
-        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_cpu tests/
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/tests_torch_cpu_failures_short.txt
+      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: pr_torch_cpu_test_reports
+        name: pr_${{ matrix.config.report }}_test_reports
         path: reports
 
-  run_tests_apple_m1:
-    name: MPS tests on Apple M1
+  run_fast_tests_apple_m1:
+    name: Fast PyTorch MPS tests on MacOS
     runs-on: [ self-hosted, apple-m1 ]
 
     steps:
@@ -91,7 +139,7 @@ jobs:
       run: |
         ${CONDA_RUN} python utils/print_env.py
 
-    - name: Run all fast tests on MPS
+    - name: Run fast PyTorch tests on M1 (MPS)
       shell: arch -arch arm64 bash {0}
       env:
         HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 397fecb560f0..2beb05e8eaca 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -14,12 +14,38 @@ env:
   RUN_SLOW: yes
 
 jobs:
-  run_tests_single_gpu:
-    name: Diffusers tests
-    runs-on: [ self-hosted, docker-gpu, single-gpu ]
+  run_slow_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: Slow PyTorch CUDA tests on Ubuntu
+            framework: pytorch
+            runner: docker-gpu
+            image: diffusers/diffusers-pytorch-cuda
+            report: torch_cuda
+          - name: Slow Flax TPU tests on Ubuntu
+            framework: flax
+            runner: docker-tpu
+            image: diffusers/diffusers-flax-tpu
+            report: flax_tpu
+          - name: Slow ONNXRuntime CUDA tests on Ubuntu
+            framework: onnxruntime
+            runner: docker-gpu
+            image: diffusers/diffusers-onnxruntime-cuda
+            report: onnx_cuda
+
+    name: ${{ matrix.config.name }}
+
+    runs-on: ${{ matrix.config.runner }}
+
     container:
-      image: nvcr.io/nvidia/pytorch:22.07-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
+
+    defaults:
+      run:
+        shell: bash
 
     steps:
     - name: Checkout diffusers
@@ -28,14 +54,12 @@ jobs:
         fetch-depth: 2
 
     - name: NVIDIA-SMI
+      if : ${{ matrix.config.runner == 'docker-gpu' }}
       run: |
         nvidia-smi
 
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
-        python -m pip uninstall -y torch torchvision torchtext
-        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu117
         python -m pip install -e .[quality,test]
         python -m pip install git+https://github.com/huggingface/accelerate
 
@@ -43,29 +67,55 @@ jobs:
       run: |
         python utils/print_env.py
 
-    - name: Run all (incl. slow) tests on GPU
+    - name: Run slow PyTorch CUDA tests
+      if: ${{ matrix.config.framework == 'pytorch' }}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run slow Flax TPU tests
+      if: ${{ matrix.config.framework == 'flax' }}
       env:
         HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=tests_torch_gpu tests/
+        python -m pytest -n 0 \
+          -s -v -k "Flax" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
+
+    - name: Run slow ONNXRuntime CUDA tests
+      if: ${{ matrix.config.framework == 'onnxruntime' }}
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "Onnx" \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/tests_torch_gpu_failures_short.txt
+      run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: torch_test_reports
+        name: ${{ matrix.config.report }}_test_reports
         path: reports
 
-  run_examples_single_gpu:
-    name: Examples tests
-    runs-on: [ self-hosted, docker-gpu, single-gpu ]
+  run_examples_tests:
+    name: Examples PyTorch CUDA tests on Ubuntu
+
+    runs-on: docker-gpu
+
     container:
-      image: nvcr.io/nvidia/pytorch:22.07-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache
+      image: diffusers/diffusers-pytorch-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
 
     steps:
     - name: Checkout diffusers
@@ -79,9 +129,6 @@ jobs:
 
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
-        python -m pip uninstall -y torch torchvision torchtext
-        python -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu117
         python -m pip install -e .[quality,test,training]
         python -m pip install git+https://github.com/huggingface/accelerate
 
@@ -93,11 +140,11 @@ jobs:
       env:
         HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
-        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_gpu examples/
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
 
     - name: Failure short reports
       if: ${{ failure() }}
-      run: cat reports/examples_torch_gpu_failures_short.txt
+      run: cat reports/examples_torch_cuda_failures_short.txt
 
     - name: Test suite reports artifacts
       if: ${{ always() }}
diff --git a/docker/diffusers-flax-cpu/Dockerfile b/docker/diffusers-flax-cpu/Dockerfile
new file mode 100644
index 000000000000..a4b4ccd65b39
--- /dev/null
+++ b/docker/diffusers-flax-cpu/Dockerfile
@@ -0,0 +1,42 @@
+FROM ubuntu:20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   python3.8 \
+                   python3-pip \
+                   python3.8-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --upgrade --no-cache-dir \
+        clu \
+        "jax[cpu]>=0.2.16,!=0.3.2" \
+        "flax>=0.4.1" \
+        "jaxlib>=0.1.65" && \
+    python3 -m pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        modelcards \
+        numpy \
+        scipy \
+        tensorboard \
+        transformers
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/diffusers-flax-tpu/Dockerfile b/docker/diffusers-flax-tpu/Dockerfile
new file mode 100644
index 000000000000..5508af6622dd
--- /dev/null
+++ b/docker/diffusers-flax-tpu/Dockerfile
@@ -0,0 +1,44 @@
+FROM ubuntu:20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   python3.8 \
+                   python3-pip \
+                   python3.8-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+        "jax[tpu]>=0.2.16,!=0.3.2" \
+        -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
+    python3 -m pip install --upgrade --no-cache-dir \
+        clu \
+        "flax>=0.4.1" \
+        "jaxlib>=0.1.65" && \
+    python3 -m pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        modelcards \
+        numpy \
+        scipy \
+        tensorboard \
+        transformers
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/diffusers-onnxruntime-cpu/Dockerfile b/docker/diffusers-onnxruntime-cpu/Dockerfile
new file mode 100644
index 000000000000..c925715915cd
--- /dev/null
+++ b/docker/diffusers-onnxruntime-cpu/Dockerfile
@@ -0,0 +1,42 @@
+FROM ubuntu:20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   python3.8 \
+                   python3-pip \
+                   python3.8-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+        torch \
+        torchvision \
+        torchaudio \
+        onnxruntime \
+        --extra-index-url https://download.pytorch.org/whl/cpu && \
+    python3 -m pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        modelcards \
+        numpy \
+        scipy \
+        tensorboard \
+        transformers
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/diffusers-onnxruntime-cuda/Dockerfile b/docker/diffusers-onnxruntime-cuda/Dockerfile
new file mode 100644
index 000000000000..e51a5e0ba30f
--- /dev/null
+++ b/docker/diffusers-onnxruntime-cuda/Dockerfile
@@ -0,0 +1,42 @@
+FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   python3.8 \
+                   python3-pip \
+                   python3.8-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+        torch \
+        torchvision \
+        torchaudio \
+        "onnxruntime-gpu>=1.13.1" \
+        --extra-index-url https://download.pytorch.org/whl/cu117 && \
+    python3 -m pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        modelcards \
+        numpy \
+        scipy \
+        tensorboard \
+        transformers
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/diffusers-pytorch-cpu/Dockerfile b/docker/diffusers-pytorch-cpu/Dockerfile
new file mode 100644
index 000000000000..41d1672f60e6
--- /dev/null
+++ b/docker/diffusers-pytorch-cpu/Dockerfile
@@ -0,0 +1,41 @@
+FROM ubuntu:20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   python3.8 \
+                   python3-pip \
+                   python3.8-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+        torch \
+        torchvision \
+        torchaudio \
+        --extra-index-url https://download.pytorch.org/whl/cpu && \
+    python3 -m pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        modelcards \
+        numpy \
+        scipy \
+        tensorboard \
+        transformers
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile
new file mode 100644
index 000000000000..ba80395c89f4
--- /dev/null
+++ b/docker/diffusers-pytorch-cuda/Dockerfile
@@ -0,0 +1,41 @@
+FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   python3.8 \
+                   python3-pip \
+                   python3.8-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+        torch \
+        torchvision \
+        torchaudio \
+        --extra-index-url https://download.pytorch.org/whl/cu117 && \
+    python3 -m pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        modelcards \
+        numpy \
+        scipy \
+        tensorboard \
+        transformers
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 6f0742e83ee5..8904242a3158 100644
--- a/setup.py
+++ b/setup.py
@@ -89,11 +89,10 @@
     "huggingface-hub>=0.10.0",
     "importlib_metadata",
     "isort>=5.5.4",
-    "jax>=0.2.8,!=0.3.2,<=0.3.6",
-    "jaxlib>=0.1.65,<=0.3.6",
+    "jax>=0.2.8,!=0.3.2",
+    "jaxlib>=0.1.65",
     "modelcards>=0.1.4",
     "numpy",
-    "onnxruntime",
     "parameterized",
     "pytest",
     "pytest-timeout",
@@ -181,7 +180,6 @@ def run(self):
 extras["test"] = deps_list(
     "accelerate",
     "datasets",
-    "onnxruntime",
     "parameterized",
     "pytest",
     "pytest-timeout",
diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py
index 64e55e932c1c..59e13da0f22b 100644
--- a/src/diffusers/dependency_versions_table.py
+++ b/src/diffusers/dependency_versions_table.py
@@ -13,11 +13,10 @@
     "huggingface-hub": "huggingface-hub>=0.10.0",
     "importlib_metadata": "importlib_metadata",
     "isort": "isort>=5.5.4",
-    "jax": "jax>=0.2.8,!=0.3.2,<=0.3.6",
-    "jaxlib": "jaxlib>=0.1.65,<=0.3.6",
+    "jax": "jax>=0.2.8,!=0.3.2",
+    "jaxlib": "jaxlib>=0.1.65",
     "modelcards": "modelcards>=0.1.4",
     "numpy": "numpy",
-    "onnxruntime": "onnxruntime",
     "parameterized": "parameterized",
     "pytest": "pytest",
     "pytest-timeout": "pytest-timeout",
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
index 1275b7f980c1..d8356675e9b3 100644
--- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
@@ -18,11 +18,15 @@
 import numpy as np
 
 from diffusers import OnnxStableDiffusionPipeline
-from diffusers.utils.testing_utils import require_onnxruntime, slow
+from diffusers.utils.testing_utils import is_onnx_available, require_onnxruntime, require_torch_gpu, slow
 
 from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin
 
 
+if is_onnx_available():
+    import onnxruntime as ort
+
+
 class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase):
     # FIXME: add fast tests
     pass
@@ -30,10 +34,23 @@ class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.Tes
 
 @slow
 @require_onnxruntime
+@require_torch_gpu
 class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase):
     def test_inference(self):
+        provider = (
+            "CUDAExecutionProvider",
+            {
+                "gpu_mem_limit": "17179869184",  # 16GB.
+                "arena_extend_strategy": "kSameAsRequested",
+            },
+        )
+        options = ort.SessionOptions()
+        options.enable_mem_pattern = False
         sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
+            "CompVis/stable-diffusion-v1-4",
+            revision="onnx",
+            provider=provider,
+            sess_options=options,
         )
 
         prompt = "A painting of a squirrel eating a burger"
@@ -72,7 +89,7 @@ def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
         test_callback_fn.has_been_called = False
 
         pipe = OnnxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
+            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CUDAExecutionProvider"
         )
         pipe.set_progress_bar_config(disable=None)
 
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
index 25f1b7574265..3ffbfc3d4f18 100644
--- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
@@ -18,11 +18,15 @@
 import numpy as np
 
 from diffusers import OnnxStableDiffusionImg2ImgPipeline
-from diffusers.utils.testing_utils import load_image, require_onnxruntime, slow
+from diffusers.utils.testing_utils import is_onnx_available, load_image, require_onnxruntime, require_torch_gpu, slow
 
 from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin
 
 
+if is_onnx_available():
+    import onnxruntime as ort
+
+
 class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase):
     # FIXME: add fast tests
     pass
@@ -30,6 +34,7 @@ class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.Tes
 
 @slow
 @require_onnxruntime
+@require_torch_gpu
 class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase):
     def test_inference(self):
         init_image = load_image(
@@ -37,8 +42,20 @@ def test_inference(self):
             "/img2img/sketch-mountains-input.jpg"
         )
         init_image = init_image.resize((768, 512))
+        provider = (
+            "CUDAExecutionProvider",
+            {
+                "gpu_mem_limit": "17179869184",  # 16GB.
+                "arena_extend_strategy": "kSameAsRequested",
+            },
+        )
+        options = ort.SessionOptions()
+        options.enable_mem_pattern = False
         pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CPUExecutionProvider"
+            "CompVis/stable-diffusion-v1-4",
+            revision="onnx",
+            provider=provider,
+            sess_options=options,
         )
         pipe.set_progress_bar_config(disable=None)
 
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
index 3f33022c112e..81cbed4e510d 100644
--- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
@@ -18,11 +18,15 @@
 import numpy as np
 
 from diffusers import OnnxStableDiffusionInpaintPipeline
-from diffusers.utils.testing_utils import load_image, require_onnxruntime, slow
+from diffusers.utils.testing_utils import is_onnx_available, load_image, require_onnxruntime, require_torch_gpu, slow
 
 from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin
 
 
+if is_onnx_available():
+    import onnxruntime as ort
+
+
 class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase):
     # FIXME: add fast tests
     pass
@@ -30,6 +34,7 @@ class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.Tes
 
 @slow
 @require_onnxruntime
+@require_torch_gpu
 class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase):
     def test_stable_diffusion_inpaint_onnx(self):
         init_image = load_image(
@@ -40,9 +45,20 @@ def test_stable_diffusion_inpaint_onnx(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
         )
-
+        provider = (
+            "CUDAExecutionProvider",
+            {
+                "gpu_mem_limit": "17179869184",  # 16GB.
+                "arena_extend_strategy": "kSameAsRequested",
+            },
+        )
+        options = ort.SessionOptions()
+        options.enable_mem_pattern = False
         pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
-            "runwayml/stable-diffusion-inpainting", revision="onnx", provider="CPUExecutionProvider"
+            "runwayml/stable-diffusion-inpainting",
+            revision="onnx",
+            provider=provider,
+            sess_options=options,
         )
         pipe.set_progress_bar_config(disable=None)
 
diff --git a/tests/test_pipelines_flax.py b/tests/test_pipelines_flax.py
index 436e139d9162..ae52fa689bef 100644
--- a/tests/test_pipelines_flax.py
+++ b/tests/test_pipelines_flax.py
@@ -59,9 +59,9 @@ def test_dummy_all_tpus(self):
 
         images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
 
-        assert images.shape == (8, 1, 64, 64, 3)
-        assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 4.151474)) < 1e-3
-        assert np.abs((np.abs(images, dtype=np.float32).sum() - 49947.875)) < 5e-1
+        assert images.shape == (8, 1, 128, 128, 3)
+        assert np.abs(np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 3.1111548) < 1e-3
+        assert np.abs(np.abs(images, dtype=np.float32).sum() - 199746.95) < 5e-1
 
         images_pil = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
 
diff --git a/tests/test_scheduler_flax.py b/tests/test_scheduler_flax.py
index d2feaa752ae2..d29a8bfcc210 100644
--- a/tests/test_scheduler_flax.py
+++ b/tests/test_scheduler_flax.py
@@ -22,9 +22,12 @@
 
 
 if is_flax_available():
+    import jax
     import jax.numpy as jnp
     from jax import random
 
+    jax_device = jax.default_backend()
+
 
 @require_flax
 class FlaxSchedulerCommonTest(unittest.TestCase):
@@ -308,8 +311,12 @@ def test_full_loop_no_noise(self):
         result_sum = jnp.sum(jnp.abs(sample))
         result_mean = jnp.mean(jnp.abs(sample))
 
-        assert abs(result_sum - 255.1113) < 1e-2
-        assert abs(result_mean - 0.332176) < 1e-3
+        if jax_device == "tpu":
+            assert abs(result_sum - 255.0714) < 1e-2
+            assert abs(result_mean - 0.332124) < 1e-3
+        else:
+            assert abs(result_sum - 255.1113) < 1e-2
+            assert abs(result_mean - 0.332176) < 1e-3
 
 
 @require_flax
@@ -570,8 +577,12 @@ def test_full_loop_with_set_alpha_to_one(self):
         result_sum = jnp.sum(jnp.abs(sample))
         result_mean = jnp.mean(jnp.abs(sample))
 
-        assert abs(result_sum - 149.8295) < 1e-2
-        assert abs(result_mean - 0.1951) < 1e-3
+        if jax_device == "tpu":
+            assert abs(result_sum - 149.8409) < 1e-2
+            assert abs(result_mean - 0.1951) < 1e-3
+        else:
+            assert abs(result_sum - 149.8295) < 1e-2
+            assert abs(result_mean - 0.1951) < 1e-3
 
     def test_full_loop_with_no_set_alpha_to_one(self):
         # We specify different beta, so that the first alpha is 0.99
@@ -579,8 +590,14 @@ def test_full_loop_with_no_set_alpha_to_one(self):
         result_sum = jnp.sum(jnp.abs(sample))
         result_mean = jnp.mean(jnp.abs(sample))
 
-        assert abs(result_sum - 149.0784) < 1e-2
-        assert abs(result_mean - 0.1941) < 1e-3
+        if jax_device == "tpu":
+            pass
+            # FIXME: both result_sum and result_mean are nan on TPU
+            # assert jnp.isnan(result_sum)
+            # assert jnp.isnan(result_mean)
+        else:
+            assert abs(result_sum - 149.0784) < 1e-2
+            assert abs(result_mean - 0.1941) < 1e-3
 
 
 @require_flax
@@ -841,8 +858,12 @@ def test_full_loop_no_noise(self):
         result_sum = jnp.sum(jnp.abs(sample))
         result_mean = jnp.mean(jnp.abs(sample))
 
-        assert abs(result_sum - 198.1318) < 1e-2
-        assert abs(result_mean - 0.2580) < 1e-3
+        if jax_device == "tpu":
+            assert abs(result_sum - 198.1542) < 1e-2
+            assert abs(result_mean - 0.2580) < 1e-3
+        else:
+            assert abs(result_sum - 198.1318) < 1e-2
+            assert abs(result_mean - 0.2580) < 1e-3
 
     def test_full_loop_with_set_alpha_to_one(self):
         # We specify different beta, so that the first alpha is 0.99
@@ -850,8 +871,12 @@ def test_full_loop_with_set_alpha_to_one(self):
         result_sum = jnp.sum(jnp.abs(sample))
         result_mean = jnp.mean(jnp.abs(sample))
 
-        assert abs(result_sum - 186.9466) < 1e-2
-        assert abs(result_mean - 0.24342) < 1e-3
+        if jax_device == "tpu":
+            assert abs(result_sum - 185.4352) < 1e-2
+            assert abs(result_mean - 0.24145) < 1e-3
+        else:
+            assert abs(result_sum - 186.9466) < 1e-2
+            assert abs(result_mean - 0.24342) < 1e-3
 
     def test_full_loop_with_no_set_alpha_to_one(self):
         # We specify different beta, so that the first alpha is 0.99
@@ -859,5 +884,9 @@ def test_full_loop_with_no_set_alpha_to_one(self):
         result_sum = jnp.sum(jnp.abs(sample))
         result_mean = jnp.mean(jnp.abs(sample))
 
-        assert abs(result_sum - 186.9482) < 1e-2
-        assert abs(result_mean - 0.2434) < 1e-3
+        if jax_device == "tpu":
+            assert abs(result_sum - 185.4352) < 1e-2
+            assert abs(result_mean - 0.2414) < 1e-3
+        else:
+            assert abs(result_sum - 186.9482) < 1e-2
+            assert abs(result_mean - 0.2434) < 1e-3

From 1216a3b122ba7f13062619c33254d2e737f379dc Mon Sep 17 00:00:00 2001
From: Omiita <77219025+omihub777@users.noreply.github.com>
Date: Wed, 2 Nov 2022 22:46:52 +0900
Subject: [PATCH 107/269] Fix a small typo of a variable name (#1063)

Fix a small typo

fix a typo in `models/attention.py`.
weight -> width
---
 src/diffusers/models/attention.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 1f9cf641c32d..372c8492b485 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -165,15 +165,15 @@ def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_atte
 
     def forward(self, hidden_states, context=None):
         # note: if no context is given, cross-attention defaults to self-attention
-        batch, channel, height, weight = hidden_states.shape
+        batch, channel, height, width = hidden_states.shape
         residual = hidden_states
         hidden_states = self.norm(hidden_states)
         hidden_states = self.proj_in(hidden_states)
         inner_dim = hidden_states.shape[1]
-        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
+        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
         for block in self.transformer_blocks:
             hidden_states = block(hidden_states, context=context)
-        hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2)
+        hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2)
         hidden_states = self.proj_out(hidden_states)
         return hidden_states + residual
 

From 5cd29d623ac38ccb3bdd8a5f654b85d4765d9751 Mon Sep 17 00:00:00 2001
From: Grigory Sizov <grisha.sizov@gmail.com>
Date: Wed, 2 Nov 2022 14:50:32 +0100
Subject: [PATCH 108/269] Fix tests for equivalence of DDIM and DDPM pipelines
 (#1069)

* Fix equality test for ddim and ddpm

* add docs for use_clipped_model_output in DDIM

* fix inline comment

* reorder imports in test_pipelines.py

* Ignore use_clipped_model_output if scheduler doesn't take it
---
 src/diffusers/pipelines/ddim/pipeline_ddim.py | 16 +++++++--
 src/diffusers/schedulers/scheduling_ddim.py   |  5 ++-
 tests/test_pipelines.py                       | 33 ++++++++++++++-----
 3 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/src/diffusers/pipelines/ddim/pipeline_ddim.py b/src/diffusers/pipelines/ddim/pipeline_ddim.py
index 74607fe87a3d..733a28c9f388 100644
--- a/src/diffusers/pipelines/ddim/pipeline_ddim.py
+++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py
@@ -13,7 +13,7 @@
 
 # limitations under the License.
 
-
+import inspect
 from typing import Optional, Tuple, Union
 
 import torch
@@ -44,6 +44,7 @@ def __call__(
         generator: Optional[torch.Generator] = None,
         eta: float = 0.0,
         num_inference_steps: int = 50,
+        use_clipped_model_output: Optional[bool] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         **kwargs,
@@ -60,6 +61,9 @@ def __call__(
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
+            use_clipped_model_output (`bool`, *optional*, defaults to `None`):
+                if `True` or `False`, see documentation for `DDIMScheduler.step`. If `None`, nothing is passed
+                downstream to the scheduler. So use `None` for schedulers which don't support this argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -82,6 +86,14 @@ def __call__(
         # set step values
         self.scheduler.set_timesteps(num_inference_steps)
 
+        # Ignore use_clipped_model_output if the scheduler doesn't accept this argument
+        accepts_use_clipped_model_output = "use_clipped_model_output" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
+        extra_kwargs = {}
+        if accepts_use_clipped_model_output:
+            extra_kwargs["use_clipped_model_output"] = use_clipped_model_output
+
         for t in self.progress_bar(self.scheduler.timesteps):
             # 1. predict noise model_output
             model_output = self.unet(image, t).sample
@@ -89,7 +101,7 @@ def __call__(
             # 2. predict previous mean of image x_t-1 and add variance depending on eta
             # eta corresponds to η in paper and should be between [0, 1]
             # do x_t -> x_t-1
-            image = self.scheduler.step(model_output, t, image, eta).prev_sample
+            image = self.scheduler.step(model_output, t, image, eta, **extra_kwargs).prev_sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index f95c18d9fae2..23648d1bc3c0 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -220,7 +220,10 @@ def step(
             sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
             eta (`float`): weight of noise for added noise in diffusion step.
-            use_clipped_model_output (`bool`): TODO
+            use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
+                predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
+                `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
+                coincide with the one provided as input and `use_clipped_model_output` will have not effect.
             generator: random number generator.
             return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index e355a19493ed..c11287339a5b 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -42,6 +42,7 @@
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, slow, torch_device
 from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir
+from parameterized import parameterized
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
@@ -445,7 +446,9 @@ def test_output_format(self):
         assert isinstance(images, list)
         assert isinstance(images[0], PIL.Image.Image)
 
-    def test_ddpm_ddim_equality(self):
+    # Make sure the test passes for different values of random seed
+    @parameterized.expand([(0,), (4,)])
+    def test_ddpm_ddim_equality(self, seed):
         model_id = "google/ddpm-cifar10-32"
 
         unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
@@ -459,17 +462,24 @@ def test_ddpm_ddim_equality(self):
         ddim.to(torch_device)
         ddim.set_progress_bar_config(disable=None)
 
-        generator = torch.manual_seed(0)
+        generator = torch.manual_seed(seed)
         ddpm_image = ddpm(generator=generator, output_type="numpy").images
 
-        generator = torch.manual_seed(0)
-        ddim_image = ddim(generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy").images
+        generator = torch.manual_seed(seed)
+        ddim_image = ddim(
+            generator=generator,
+            num_inference_steps=1000,
+            eta=1.0,
+            output_type="numpy",
+            use_clipped_model_output=True,  # Need this to make DDIM match DDPM
+        ).images
 
         # the values aren't exactly equal, but the images look the same visually
         assert np.abs(ddpm_image - ddim_image).max() < 1e-1
 
-    @unittest.skip("(Anton) The test is failing for large batch sizes, needs investigation")
-    def test_ddpm_ddim_equality_batched(self):
+    # Make sure the test passes for different values of random seed
+    @parameterized.expand([(0,), (4,)])
+    def test_ddpm_ddim_equality_batched(self, seed):
         model_id = "google/ddpm-cifar10-32"
 
         unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
@@ -484,12 +494,17 @@ def test_ddpm_ddim_equality_batched(self):
         ddim.to(torch_device)
         ddim.set_progress_bar_config(disable=None)
 
-        generator = torch.manual_seed(0)
+        generator = torch.manual_seed(seed)
         ddpm_images = ddpm(batch_size=4, generator=generator, output_type="numpy").images
 
-        generator = torch.manual_seed(0)
+        generator = torch.manual_seed(seed)
         ddim_images = ddim(
-            batch_size=4, generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy"
+            batch_size=4,
+            generator=generator,
+            num_inference_steps=1000,
+            eta=1.0,
+            output_type="numpy",
+            use_clipped_model_output=True,  # Need this to make DDIM match DDPM
         ).images
 
         # the values aren't exactly equal, but the images look the same visually

From 33c487455e21d17b02aaca8cee22cbeca83f74ed Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe <shirayu@users.noreply.github.com>
Date: Thu, 3 Nov 2022 00:37:05 +0900
Subject: [PATCH 109/269] Fix padding in dreambooth (#1030)

---
 examples/dreambooth/train_dreambooth.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index c37197fdc64c..9c512ef571d8 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -494,7 +494,12 @@ def collate_fn(examples):
         pixel_values = torch.stack(pixel_values)
         pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()
 
-        input_ids = tokenizer.pad({"input_ids": input_ids}, padding=True, return_tensors="pt").input_ids
+        input_ids = tokenizer.pad(
+            {"input_ids": input_ids},
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            return_tensors="pt",
+        ).input_ids
 
         batch = {
             "input_ids": input_ids,

From 0b61cea347e9b464fb03506cb78a49d38e1c74ee Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 2 Nov 2022 18:54:30 +0300
Subject: [PATCH 110/269] [Flax] time embedding (#1081)

* initial get_sinusoidal_embeddings

* added asserts

* better var name

* fix docs
---
 src/diffusers/models/embeddings_flax.py | 50 +++++++++++++++++--------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/src/diffusers/models/embeddings_flax.py b/src/diffusers/models/embeddings_flax.py
index e2d607499c79..1e2272c1fe70 100644
--- a/src/diffusers/models/embeddings_flax.py
+++ b/src/diffusers/models/embeddings_flax.py
@@ -17,23 +17,41 @@
 import jax.numpy as jnp
 
 
-# This is like models.embeddings.get_timestep_embedding (PyTorch) but
-# less general (only handles the case we currently need).
-def get_sinusoidal_embeddings(timesteps, embedding_dim, freq_shift: float = 1):
+def get_sinusoidal_embeddings(
+    timesteps: jnp.ndarray,
+    embedding_dim: int,
+    freq_shift: float = 1,
+    min_timescale: float = 1,
+    max_timescale: float = 1.0e4,
+    flip_sin_to_cos: bool = False,
+    scale: float = 1.0,
+) -> jnp.ndarray:
+    """Returns the positional encoding (same as Tensor2Tensor).
+    Args:
+        timesteps: a 1-D Tensor of N indices, one per batch element.
+        These may be fractional.
+        embedding_dim: The number of output channels.
+        min_timescale: The smallest time unit (should probably be 0.0).
+        max_timescale: The largest time unit.
+    Returns:
+        a Tensor of timing signals [N, num_channels]
     """
-    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
+    assert timesteps.ndim == 1, "Timesteps should be a 1d-array"
+    assert embedding_dim % 2 == 0, f"Embedding dimension {embedding_dim} should be even"
+    num_timescales = float(embedding_dim // 2)
+    log_timescale_increment = math.log(max_timescale / min_timescale) / (num_timescales - freq_shift)
+    inv_timescales = min_timescale * jnp.exp(jnp.arange(num_timescales, dtype=jnp.float32) * -log_timescale_increment)
+    emb = jnp.expand_dims(timesteps, 1) * jnp.expand_dims(inv_timescales, 0)
 
-    :param timesteps: a 1-D tensor of N indices, one per batch element.
-                      These may be fractional.
-    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
-    embeddings. :return: an [N x dim] tensor of positional embeddings.
-    """
-    half_dim = embedding_dim // 2
-    emb = math.log(10000) / (half_dim - freq_shift)
-    emb = jnp.exp(jnp.arange(half_dim) * -emb)
-    emb = timesteps[:, None] * emb[None, :]
-    emb = jnp.concatenate([jnp.cos(emb), jnp.sin(emb)], -1)
-    return emb
+    # scale embeddings
+    scaled_time = scale * emb
+
+    if flip_sin_to_cos:
+        signal = jnp.concatenate([jnp.cos(scaled_time), jnp.sin(scaled_time)], axis=1)
+    else:
+        signal = jnp.concatenate([jnp.sin(scaled_time), jnp.cos(scaled_time)], axis=1)
+    signal = jnp.reshape(signal, [jnp.shape(timesteps)[0], embedding_dim])
+    return signal
 
 
 class FlaxTimestepEmbedding(nn.Module):
@@ -70,4 +88,4 @@ class FlaxTimesteps(nn.Module):
 
     @nn.compact
     def __call__(self, timesteps):
-        return get_sinusoidal_embeddings(timesteps, self.dim, freq_shift=self.freq_shift)
+        return get_sinusoidal_embeddings(timesteps, embedding_dim=self.dim, freq_shift=self.freq_shift)

From cbcd0512f04064bb25206ad844ebc4a9a022cad1 Mon Sep 17 00:00:00 2001
From: Denis <lukovnikov.denis@gmail.com>
Date: Wed, 2 Nov 2022 17:43:40 +0100
Subject: [PATCH 111/269] Training to predict x0 in training example (#1031)

* changed training example to add option to train model that predicts x0 (instead of eps), changed DDPM pipeline accordingly

* Revert "changed training example to add option to train model that predicts x0 (instead of eps), changed DDPM pipeline accordingly"

This reverts commit c5efb525648885f2e7df71f4483a9f248515ad61.

* changed training example to add option to train model that predicts x0 (instead of eps), changed DDPM pipeline accordingly

* fixed code style

Co-authored-by: lukovnikov <lukovnikov@users.noreply.github.com>
---
 .../train_unconditional.py                    | 55 +++++++++++++++++--
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py |  5 +-
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py
index 2bc8114caca8..3f9ffb11ef45 100644
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -29,6 +29,24 @@
 logger = get_logger(__name__)
 
 
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    if not isinstance(arr, torch.Tensor):
+        arr = torch.from_numpy(arr)
+    res = arr[timesteps].float().to(timesteps.device)
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res.expand(broadcast_shape)
+
+
 def parse_args():
     parser = argparse.ArgumentParser(description="Simple example of a training script.")
     parser.add_argument(
@@ -171,6 +189,16 @@ def parse_args():
         ),
     )
 
+    parser.add_argument(
+        "--predict_mode",
+        type=str,
+        default="eps",
+        help="What the model should predict. 'eps' to predict error, 'x0' to directly predict reconstruction",
+    )
+
+    parser.add_argument("--ddpm_num_steps", type=int, default=1000)
+    parser.add_argument("--ddpm_beta_schedule", type=str, default="linear")
+
     args = parser.parse_args()
     env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
     if env_local_rank != -1 and env_local_rank != args.local_rank:
@@ -224,7 +252,7 @@ def main(args):
             "UpBlock2D",
         ),
     )
-    noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
+    noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule)
     optimizer = torch.optim.AdamW(
         model.parameters(),
         lr=args.learning_rate,
@@ -257,6 +285,8 @@ def transforms(examples):
         images = [augmentations(image.convert("RGB")) for image in examples["image"]]
         return {"input": images}
 
+    logger.info(f"Dataset size: {len(dataset)}")
+
     dataset.set_transform(transforms)
     train_dataloader = torch.utils.data.DataLoader(
         dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers
@@ -319,8 +349,20 @@ def transforms(examples):
 
             with accelerator.accumulate(model):
                 # Predict the noise residual
-                noise_pred = model(noisy_images, timesteps).sample
-                loss = F.mse_loss(noise_pred, noise)
+                model_output = model(noisy_images, timesteps).sample
+
+                if args.predict_mode == "eps":
+                    loss = F.mse_loss(model_output, noise)  # this could have different weights!
+                elif args.predict_mode == "x0":
+                    alpha_t = _extract_into_tensor(
+                        noise_scheduler.alphas_cumprod, timesteps, (clean_images.shape[0], 1, 1, 1)
+                    )
+                    snr_weights = alpha_t / (1 - alpha_t)
+                    loss = snr_weights * F.mse_loss(
+                        model_output, clean_images, reduction="none"
+                    )  # use SNR weighting from distillation paper
+                    loss = loss.mean()
+
                 accelerator.backward(loss)
 
                 if accelerator.sync_gradients:
@@ -355,7 +397,12 @@ def transforms(examples):
 
                 generator = torch.manual_seed(0)
                 # run pipeline in inference (sample random noise and denoise)
-                images = pipeline(generator=generator, batch_size=args.eval_batch_size, output_type="numpy").images
+                images = pipeline(
+                    generator=generator,
+                    batch_size=args.eval_batch_size,
+                    output_type="numpy",
+                    predict_epsilon=args.predict_mode == "eps",
+                ).images
 
                 # denormalize the images and save to tensorboard
                 images_processed = (images * 255).round().astype("uint8")
diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index 85db691dc657..811614ecbdde 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -45,6 +45,7 @@ def __call__(
         num_inference_steps: int = 1000,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
+        predict_epsilon: bool = True,
         **kwargs,
     ) -> Union[ImagePipelineOutput, Tuple]:
         r"""
@@ -84,7 +85,9 @@ def __call__(
             model_output = self.unet(image, t).sample
 
             # 2. compute previous image: x_t -> x_t-1
-            image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample
+            image = self.scheduler.step(
+                model_output, t, image, generator=generator, predict_epsilon=predict_epsilon
+            ).prev_sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()

From c39a511b5ff7d37e594aad22b8ac85d0fef20406 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 2 Nov 2022 19:20:42 +0100
Subject: [PATCH 112/269] [Loading] Ignore unneeded files (#1107)

* [Loading] Ignore unneeded files

* up
---
 src/diffusers/pipeline_flax_utils.py | 12 +++++++++++-
 src/diffusers/pipeline_utils.py      |  4 ++++
 tests/test_pipelines.py              | 16 ++++++++++++++++
 tests/test_pipelines_flax.py         | 21 ++++++++++++++++++++-
 4 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
index 80555f278505..c281c772dbd2 100644
--- a/src/diffusers/pipeline_flax_utils.py
+++ b/src/diffusers/pipeline_flax_utils.py
@@ -302,10 +302,19 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             allow_patterns = [os.path.join(k, "*") for k in folder_names]
             allow_patterns += [FLAX_WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, CONFIG_NAME, cls.config_name]
 
+            # make sure we don't download PyTorch weights
+            ignore_patterns = "*.bin"
+
             if cls != FlaxDiffusionPipeline:
                 requested_pipeline_class = cls.__name__
             else:
                 requested_pipeline_class = config_dict.get("_class_name", cls.__name__)
+                requested_pipeline_class = (
+                    requested_pipeline_class
+                    if requested_pipeline_class.startswith("Flax")
+                    else "Flax" + requested_pipeline_class
+                )
+
             user_agent = {"pipeline_class": requested_pipeline_class}
             user_agent = http_user_agent(user_agent)
 
@@ -319,6 +328,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 use_auth_token=use_auth_token,
                 revision=revision,
                 allow_patterns=allow_patterns,
+                ignore_patterns=ignore_patterns,
                 user_agent=user_agent,
             )
         else:
@@ -337,7 +347,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 if config_dict["_class_name"].startswith("Flax")
                 else "Flax" + config_dict["_class_name"]
             )
-            pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
+            pipeline_class = getattr(diffusers_module, class_name)
 
         # some modules can be passed directly to the init
         # in this case they are already instantiated in `kwargs`
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 5c94df25cca0..94c1e135abe5 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -395,6 +395,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             allow_patterns = [os.path.join(k, "*") for k in folder_names]
             allow_patterns += [WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, CONFIG_NAME, ONNX_WEIGHTS_NAME, cls.config_name]
 
+            # make sure we don't download flax weights
+            ignore_patterns = "*.msgpack"
+
             if custom_pipeline is not None:
                 allow_patterns += [CUSTOM_PIPELINE_FILE_NAME]
 
@@ -417,6 +420,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 use_auth_token=use_auth_token,
                 revision=revision,
                 allow_patterns=allow_patterns,
+                ignore_patterns=ignore_patterns,
                 user_agent=user_agent,
             )
         else:
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index c11287339a5b..1654518f1e93 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -73,6 +73,22 @@ def test_progress_bar(capsys):
     assert captured.err == "", "Progress bar should be disabled"
 
 
+class DownloadTests(unittest.TestCase):
+    def test_download_only_pytorch(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # pipeline has Flax weights
+            _ = DiffusionPipeline.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname
+            )
+
+            all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots"))]
+            files = [item for sublist in all_root_files for item in sublist]
+
+            # None of the downloaded files should be a flax file even if we have some here:
+            # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack
+            assert not any(f.endswith(".msgpack") for f in files)
+
+
 class CustomPipelineTests(unittest.TestCase):
     def test_load_custom_pipeline(self):
         pipeline = DiffusionPipeline.from_pretrained(
diff --git a/tests/test_pipelines_flax.py b/tests/test_pipelines_flax.py
index ae52fa689bef..ac5e2621a514 100644
--- a/tests/test_pipelines_flax.py
+++ b/tests/test_pipelines_flax.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+import tempfile
 import unittest
 
 import numpy as np
@@ -24,12 +26,29 @@
 if is_flax_available():
     import jax
     import jax.numpy as jnp
-    from diffusers import FlaxDDIMScheduler, FlaxStableDiffusionPipeline
+    from diffusers import FlaxDDIMScheduler, FlaxDiffusionPipeline, FlaxStableDiffusionPipeline
     from flax.jax_utils import replicate
     from flax.training.common_utils import shard
     from jax import pmap
 
 
+@require_flax
+class DownloadTests(unittest.TestCase):
+    def test_download_only_pytorch(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # pipeline has Flax weights
+            _ = FlaxDiffusionPipeline.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname
+            )
+
+            all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots"))]
+            files = [item for sublist in all_root_files for item in sublist]
+
+            # None of the downloaded files should be a PyTorch file even if we have some here:
+            # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_pytorch_model.bin
+            assert not any(f.endswith(".bin") for f in files)
+
+
 @slow
 @require_flax
 class FlaxPipelineTests(unittest.TestCase):

From 0edf9ca082b0b405435767bc9e96b49b15390fd9 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 3 Nov 2022 15:24:32 +0100
Subject: [PATCH 113/269] Fix hub-dependent tests for PRs (#1119)

* Remove the hub token

* replace repos

* style
---
 .github/workflows/pr_tests.yml |  8 --------
 tests/test_config.py           | 14 ++++++++------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index 242e9552d934..c978efe3b7db 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -67,8 +67,6 @@ jobs:
 
     - name: Run fast PyTorch CPU tests
       if: ${{ matrix.config.framework == 'pytorch' }}
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
         python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "not Flax and not Onnx" \
@@ -77,8 +75,6 @@ jobs:
 
     - name: Run fast Flax TPU tests
       if: ${{ matrix.config.framework == 'flax' }}
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
         python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "Flax" \
@@ -87,8 +83,6 @@ jobs:
 
     - name: Run fast ONNXRuntime CPU tests
       if: ${{ matrix.config.framework == 'onnxruntime' }}
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
         python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
           -s -v -k "Onnx" \
@@ -141,8 +135,6 @@ jobs:
 
     - name: Run fast PyTorch tests on M1 (MPS)
       shell: arch -arch arm64 bash {0}
-      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       run: |
         ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/
 
diff --git a/tests/test_config.py b/tests/test_config.py
index 1c6178d7208a..7a9f270af364 100755
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -244,28 +244,30 @@ def test_load_ddim_from_pndm(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
         with CaptureLogger(logger) as cap_logger:
-            ddim = DDIMScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+            ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
 
         assert ddim.__class__ == DDIMScheduler
         # no warning should be thrown
         assert cap_logger.out == ""
 
-    def test_load_ddim_from_euler(self):
+    def test_load_euler_from_pndm(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
         with CaptureLogger(logger) as cap_logger:
-            euler = EulerDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+            euler = EulerDiscreteScheduler.from_config(
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+            )
 
         assert euler.__class__ == EulerDiscreteScheduler
         # no warning should be thrown
         assert cap_logger.out == ""
 
-    def test_load_ddim_from_euler_ancestral(self):
+    def test_load_euler_ancestral_from_pndm(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
         with CaptureLogger(logger) as cap_logger:
             euler = EulerAncestralDiscreteScheduler.from_config(
-                "runwayml/stable-diffusion-v1-5", subfolder="scheduler"
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
             )
 
         assert euler.__class__ == EulerAncestralDiscreteScheduler
@@ -276,7 +278,7 @@ def test_load_pndm(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
         with CaptureLogger(logger) as cap_logger:
-            pndm = PNDMScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+            pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
 
         assert pndm.__class__ == PNDMScheduler
         # no warning should be thrown

From 4a38166afeea498e8d333d40264e778ac5b16d81 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 3 Nov 2022 15:41:33 +0100
Subject: [PATCH 114/269] Allow saving `None` pipeline components (#1118)

* Allow saving `None` pipeline components

* support flax as well

* style
---
 src/diffusers/pipeline_flax_utils.py                  |  9 +++++++++
 src/diffusers/pipeline_utils.py                       |  9 +++++++++
 .../stable_diffusion/test_stable_diffusion.py         | 11 +++++++++++
 3 files changed, 29 insertions(+)

diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
index c281c772dbd2..e63009b49c8f 100644
--- a/src/diffusers/pipeline_flax_utils.py
+++ b/src/diffusers/pipeline_flax_utils.py
@@ -161,6 +161,10 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], params: Union
 
         for pipeline_component_name in model_index_dict.keys():
             sub_model = getattr(self, pipeline_component_name)
+            if sub_model is None:
+                # edge case for saving a pipeline with safety_checker=None
+                continue
+
             model_cls = sub_model.__class__
 
             save_method_name = None
@@ -367,6 +371,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
         # 3. Load each module in the pipeline
         for name, (library_name, class_name) in init_dict.items():
+            if class_name is None:
+                # edge case for when the pipeline was saved with safety_checker=None
+                init_kwargs[name] = None
+                continue
+
             is_pipeline_module = hasattr(pipelines, library_name)
             loaded_sub_model = None
             sub_model_should_be_defined = True
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 94c1e135abe5..4ba8d2d9300d 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -176,6 +176,10 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike]):
 
         for pipeline_component_name in model_index_dict.keys():
             sub_model = getattr(self, pipeline_component_name)
+            if sub_model is None:
+                # edge case for saving a pipeline with safety_checker=None
+                continue
+
             model_cls = sub_model.__class__
 
             save_method_name = None
@@ -477,6 +481,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
         # 3. Load each module in the pipeline
         for name, (library_name, class_name) in init_dict.items():
+            if class_name is None:
+                # edge case for when the pipeline was saved with safety_checker=None
+                init_kwargs[name] = None
+                continue
+
             # 3.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names
             if class_name.startswith("Flax"):
                 class_name = class_name[4:]
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 260d58e94b04..ded2470cc2e5 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -15,6 +15,7 @@
 
 import gc
 import random
+import tempfile
 import time
 import unittest
 
@@ -318,6 +319,16 @@ def test_stable_diffusion_no_safety_checker(self):
         image = pipe("example prompt", num_inference_steps=2).images[0]
         assert image is not None
 
+        # check that there's no error when saving a pipeline with one of the models being None
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+        # sanity check that the pipeline still works
+        assert pipe.safety_checker is None
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
     def test_stable_diffusion_k_lms(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         unet = self.dummy_cond_unet

From d38c804320192c3844ff0bc7deed83e8b8cb7856 Mon Sep 17 00:00:00 2001
From: Revist <revistoo@gmail.com>
Date: Thu, 3 Nov 2022 15:42:46 +0100
Subject: [PATCH 115/269] feat: add repaint (#974)

* feat: add repaint

* fix: fix quality check with `make fix-copies`

* fix: remove old unnecessary arg

* chore: change default to DDPM (looks better in experiments)

* ".to(device)" changed to "device="

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* make generator device-specific

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* make generator device-specific and change shape

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* fix: add preprocessing for image and mask

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* fix: update test

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* Update src/diffusers/pipelines/repaint/pipeline_repaint.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Add docs and examples

* Fix toctree

Co-authored-by: fja <fja@zurich.ibm.com>
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 docs/source/_toctree.yml                      |   2 +
 docs/source/api/pipelines/overview.mdx        |  28 +-
 docs/source/api/pipelines/repaint.mdx         |  77 +++++
 docs/source/api/schedulers.mdx                |  12 +-
 src/diffusers/__init__.py                     |   2 +
 src/diffusers/pipelines/__init__.py           |   1 +
 src/diffusers/pipelines/repaint/__init__.py   |   1 +
 .../pipelines/repaint/pipeline_repaint.py     | 140 ++++++++
 src/diffusers/schedulers/__init__.py          |   1 +
 .../schedulers/scheduling_repaint.py          | 322 ++++++++++++++++++
 src/diffusers/utils/dummy_pt_objects.py       |  30 ++
 tests/pipelines/repaint/__init__.py           |   0
 tests/pipelines/repaint/test_repaint.py       |  65 ++++
 13 files changed, 667 insertions(+), 14 deletions(-)
 create mode 100644 docs/source/api/pipelines/repaint.mdx
 create mode 100644 src/diffusers/pipelines/repaint/__init__.py
 create mode 100644 src/diffusers/pipelines/repaint/pipeline_repaint.py
 create mode 100644 src/diffusers/schedulers/scheduling_repaint.py
 create mode 100644 tests/pipelines/repaint/__init__.py
 create mode 100644 tests/pipelines/repaint/test_repaint.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 7e46d95a46f1..331d4fff7870 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -96,5 +96,7 @@
       title: "Stochastic Karras VE"
     - local: api/pipelines/dance_diffusion
       title: "Dance Diffusion"
+    - local: api/pipelines/repaint
+      title: "RePaint"
     title: "Pipelines"
   title: "API"
diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index 9bb351d93c8d..a53a2f8b4137 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -41,19 +41,21 @@ If you are looking for *official* training examples, please have a look at [exam
 The following table summarizes all officially supported pipelines, their corresponding paper, and if 
 available a colab notebook to directly try them out.
 
-| Pipeline | Paper | Tasks | Colab
-|---|---|:---:|:---:|
-| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
-| [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
-| [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
-| [pndm](./pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
-| [score_sde_ve](./score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
-| [score_sde_vp](./score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
-| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
-| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
-| [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
+| Pipeline                                             | Paper                                                                                                                        |                 Tasks                 | Colab
+|------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------:|:---:|
+| [ddpm](./ddpm)                                       | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239)                                             |    Unconditional Image Generation     |
+| [ddim](./ddim)                                       | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502)                                                  |    Unconditional Image Generation     | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [latent_diffusion](./latent_diffusion)               | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)                         |       Text-to-Image Generation        | 
+| [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)                         |    Unconditional Image Generation     | 
+| [pndm](./pndm)                                       | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778)                           |    Unconditional Image Generation     | 
+| [score_sde_ve](./score_sde_ve)                       | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) |    Unconditional Image Generation     | 
+| [score_sde_vp](./score_sde_vp)                       | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) |    Unconditional Image Generation     | 
+| [stable_diffusion](./stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            |       Text-to-Image Generation        | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion](./stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](./stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            |     Text-Guided Image Inpainting      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stochastic_karras_ve](./stochastic_karras_ve)       | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364)                    |    Unconditional Image Generation     | 
+| [repaint](./repaint)                                 | [**RePaint: Inpainting using Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2201.09865)                   |           Image Inpainting            | 
+
 
 **Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
 
diff --git a/docs/source/api/pipelines/repaint.mdx b/docs/source/api/pipelines/repaint.mdx
new file mode 100644
index 000000000000..0b7de8a4579d
--- /dev/null
+++ b/docs/source/api/pipelines/repaint.mdx
@@ -0,0 +1,77 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# RePaint
+
+## Overview
+
+[RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865) (PNDM) by Andreas Lugmayr, Martin Danelljan, Andres Romero, Fisher Yu, Radu Timofte, Luc Van Gool.
+
+The abstract of the paper is the following:
+
+Free-form inpainting is the task of adding new content to an image in the regions specified by an arbitrary binary mask. Most existing approaches train for a certain distribution of masks, which limits their generalization capabilities to unseen mask types. Furthermore, training with pixel-wise and perceptual losses often leads to simple textural extensions towards the missing areas instead of semantically meaningful generation. In this work, we propose RePaint: A Denoising Diffusion Probabilistic Model (DDPM) based inpainting approach that is applicable to even extreme masks. We employ a pretrained unconditional DDPM as the generative prior. To condition the generation process, we only alter the reverse diffusion iterations by sampling the unmasked regions using the given image information. Since this technique does not modify or condition the original DDPM network itself, the model produces high-quality and diverse output images for any inpainting form. We validate our method for both faces and general-purpose image inpainting using standard and extreme masks.
+RePaint outperforms state-of-the-art Autoregressive, and GAN approaches for at least five out of six mask distributions.
+
+The original codebase can be found [here](https://github.com/andreas128/RePaint).
+
+## Available Pipelines:
+
+| Pipeline                                                                                                                      | Tasks              | Colab
+|-------------------------------------------------------------------------------------------------------------------------------|--------------------|:---:|
+| [pipeline_repaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/repaint/pipeline_repaint.py) | *Image Inpainting* | - |
+
+## Usage example
+
+```python
+from io import BytesIO
+
+import torch
+
+import PIL
+import requests
+from diffusers import RePaintPipeline, RePaintScheduler
+
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png"
+mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png"
+
+# Load the original image and the mask as PIL images
+original_image = download_image(img_url).resize((256, 256))
+mask_image = download_image(mask_url).resize((256, 256))
+
+# Load the RePaint scheduler and pipeline based on a pretrained DDPM model
+scheduler = RePaintScheduler.from_config("google/ddpm-ema-celebahq-256")
+pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
+pipe = pipe.to("cuda")
+
+generator = torch.Generator(device="cuda").manual_seed(0)
+output = pipe(
+    original_image=original_image,
+    mask_image=mask_image,
+    num_inference_steps=250,
+    eta=0.0,
+    jump_length=10,
+    jump_n_sample=10,
+    generator=generator,
+)
+inpainted_image = output.images[0]
+```
+
+## RePaintPipeline
+[[autodoc]] pipelines.repaint.pipeline_repaint.RePaintPipeline
+    - __call__
+
diff --git a/docs/source/api/schedulers.mdx b/docs/source/api/schedulers.mdx
index 6616a3e51538..6e7da10e33e5 100644
--- a/docs/source/api/schedulers.mdx
+++ b/docs/source/api/schedulers.mdx
@@ -127,4 +127,14 @@ Fast scheduler which often times generates good outputs with 20-30 steps.
 Ancestral sampling with Euler method steps. Based on the original (k-diffusion)[https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L72] implementation by Katherine Crowson.
 Fast scheduler which often times generates good outputs with 20-30 steps.
 
-[[autodoc]] EulerAncestralDiscreteScheduler
\ No newline at end of file
+[[autodoc]] EulerAncestralDiscreteScheduler
+
+
+#### RePaint scheduler
+
+DDPM-based inpainting scheduler for unsupervised inpainting with extreme masks. 
+Intended for use with [`RePaintPipeline`].
+Based on the paper [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865) 
+and the original implementation by Andreas Lugmayr et al.: https://github.com/andreas128/RePaint
+
+[[autodoc]] RePaintScheduler
\ No newline at end of file
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 49c3e82b8e7b..1a9a7d74ebee 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -36,6 +36,7 @@
         KarrasVePipeline,
         LDMPipeline,
         PNDMPipeline,
+        RePaintPipeline,
         ScoreSdeVePipeline,
     )
     from .schedulers import (
@@ -46,6 +47,7 @@
         IPNDMScheduler,
         KarrasVeScheduler,
         PNDMScheduler,
+        RePaintScheduler,
         SchedulerMixin,
         ScoreSdeVeScheduler,
     )
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index b3124af39077..8015d4e11466 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -7,6 +7,7 @@
     from .ddpm import DDPMPipeline
     from .latent_diffusion_uncond import LDMPipeline
     from .pndm import PNDMPipeline
+    from .repaint import RePaintPipeline
     from .score_sde_ve import ScoreSdeVePipeline
     from .stochastic_karras_ve import KarrasVePipeline
 else:
diff --git a/src/diffusers/pipelines/repaint/__init__.py b/src/diffusers/pipelines/repaint/__init__.py
new file mode 100644
index 000000000000..16bc86d1cedf
--- /dev/null
+++ b/src/diffusers/pipelines/repaint/__init__.py
@@ -0,0 +1 @@
+from .pipeline_repaint import RePaintPipeline
diff --git a/src/diffusers/pipelines/repaint/pipeline_repaint.py b/src/diffusers/pipelines/repaint/pipeline_repaint.py
new file mode 100644
index 000000000000..7af88f627559
--- /dev/null
+++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py
@@ -0,0 +1,140 @@
+# Copyright 2022 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+import PIL
+from tqdm.auto import tqdm
+
+from ...models import UNet2DModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import RePaintScheduler
+
+
+def _preprocess_image(image: PIL.Image.Image):
+    image = np.array(image.convert("RGB"))
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+    return image
+
+
+def _preprocess_mask(mask: PIL.Image.Image):
+    mask = np.array(mask.convert("L"))
+    mask = mask.astype(np.float32) / 255.0
+    mask = mask[None, None]
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
+    mask = torch.from_numpy(mask)
+    return mask
+
+
+class RePaintPipeline(DiffusionPipeline):
+    unet: UNet2DModel
+    scheduler: RePaintScheduler
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        original_image: Union[torch.FloatTensor, PIL.Image.Image],
+        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
+        num_inference_steps: int = 250,
+        eta: float = 0.0,
+        jump_length: int = 10,
+        jump_n_sample: int = 10,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Args:
+            original_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                The original image to inpaint on.
+            mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                The mask_image where 0.0 values define which part of the original image to inpaint (change).
+            num_inference_steps (`int`, *optional*, defaults to 1000):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            eta (`float`):
+                The weight of noise for added noise in a diffusion step. Its value is between 0.0 and 1.0 - 0.0 is DDIM
+                and 1.0 is DDPM scheduler respectively.
+            jump_length (`int`, *optional*, defaults to 10):
+                The number of steps taken forward in time before going backward in time for a single jump ("j" in
+                RePaint paper). Take a look at Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
+            jump_n_sample (`int`, *optional*, defaults to 10):
+                The number of times we will make forward time jump for a given chosen time sample. Take a look at
+                Figure 9 and 10 in https://arxiv.org/pdf/2201.09865.pdf.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        if not isinstance(original_image, torch.FloatTensor):
+            original_image = _preprocess_image(original_image)
+        original_image = original_image.to(self.device)
+        if not isinstance(mask_image, torch.FloatTensor):
+            mask_image = _preprocess_mask(mask_image)
+        mask_image = mask_image.to(self.device)
+
+        # sample gaussian noise to begin the loop
+        image = torch.randn(
+            original_image.shape,
+            generator=generator,
+            device=self.device,
+        )
+        image = image.to(self.device)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample, self.device)
+        self.scheduler.eta = eta
+
+        t_last = self.scheduler.timesteps[0] + 1
+        for i, t in enumerate(tqdm(self.scheduler.timesteps)):
+            if t < t_last:
+                # predict the noise residual
+                model_output = self.unet(image, t).sample
+                # compute previous image: x_t -> x_t-1
+                image = self.scheduler.step(model_output, t, image, original_image, mask_image, generator).prev_sample
+
+            else:
+                # compute the reverse: x_t-1 -> x_t
+                image = self.scheduler.undo_step(image, t_last, generator)
+            t_last = t
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index c3999d2cac61..a1915ed8d2b1 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -24,6 +24,7 @@
     from .scheduling_ipndm import IPNDMScheduler
     from .scheduling_karras_ve import KarrasVeScheduler
     from .scheduling_pndm import PNDMScheduler
+    from .scheduling_repaint import RePaintScheduler
     from .scheduling_sde_ve import ScoreSdeVeScheduler
     from .scheduling_sde_vp import ScoreSdeVpScheduler
     from .scheduling_utils import SchedulerMixin
diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py
new file mode 100644
index 000000000000..1751f41c66f2
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_repaint.py
@@ -0,0 +1,322 @@
+# Copyright 2022 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class RePaintSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+        pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
+            The predicted denoised sample (x_{0}) based on the model output from
+             the current timestep. `pred_original_sample` can be used to preview progress or for guidance.
+    """
+
+    prev_sample: torch.FloatTensor
+    pred_original_sample: torch.FloatTensor
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+
+    def alpha_bar(time_step):
+        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class RePaintScheduler(SchedulerMixin, ConfigMixin):
+    """
+    RePaint is a schedule for DDPM inpainting inside a given mask.
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+    [`~ConfigMixin.from_config`] functions.
+
+    For more details, see the original paper: https://arxiv.org/pdf/2201.09865.pdf
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        eta (`float`):
+            The weight of noise for added noise in a diffusion step. Its value is between 0.0 and 1.0 -0.0 is DDIM and
+            1.0 is DDPM scheduler respectively.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        variance_type (`str`):
+            options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`,
+            `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
+        clip_sample (`bool`, default `True`):
+            option to clip predicted sample between -1 and 1 for numerical stability.
+
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        eta: float = 0.0,
+        trained_betas: Optional[np.ndarray] = None,
+        clip_sample: bool = True,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.from_numpy(trained_betas)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = (
+                torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+            )
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        elif beta_schedule == "sigmoid":
+            # GeoDiff sigmoid schedule
+            betas = torch.linspace(-6, 6, num_train_timesteps)
+            self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start
+        else:
+            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        self.one = torch.tensor(1.0)
+
+        self.final_alpha_cumprod = torch.tensor(1.0)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+        self.eta = eta
+
+    def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.FloatTensor`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `torch.FloatTensor`: scaled input sample
+        """
+        return sample
+
+    def set_timesteps(
+        self,
+        num_inference_steps: int,
+        jump_length: int = 10,
+        jump_n_sample: int = 10,
+        device: Union[str, torch.device] = None,
+    ):
+        num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps)
+        self.num_inference_steps = num_inference_steps
+
+        timesteps = []
+
+        jumps = {}
+        for j in range(0, num_inference_steps - jump_length, jump_length):
+            jumps[j] = jump_n_sample - 1
+
+        t = num_inference_steps
+        while t >= 1:
+            t = t - 1
+            timesteps.append(t)
+
+            if jumps.get(t, 0) > 0:
+                jumps[t] = jumps[t] - 1
+                for _ in range(jump_length):
+                    t = t + 1
+                    timesteps.append(t)
+
+        timesteps = np.array(timesteps) * (self.config.num_train_timesteps // self.num_inference_steps)
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def _get_variance(self, t):
+        prev_timestep = t - self.config.num_train_timesteps // self.num_inference_steps
+
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+        # For t > 0, compute predicted variance βt (see formula (6) and (7) from
+        # https://arxiv.org/pdf/2006.11239.pdf) and sample from it to get
+        # previous sample x_{t-1} ~ N(pred_prev_sample, variance) == add
+        # variance to pred_sample
+        # Is equivalent to formula (16) in https://arxiv.org/pdf/2010.02502.pdf
+        # without eta.
+        # variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+
+        return variance
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        sample: torch.FloatTensor,
+        original_image: torch.FloatTensor,
+        mask: torch.FloatTensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[RePaintSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.FloatTensor`): direct output from learned
+                diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
+            original_image (`torch.FloatTensor`):
+                the original image to inpaint on.
+            mask (`torch.FloatTensor`):
+                the mask where 0.0 values define which part of the original image to inpaint (change).
+            generator (`torch.Generator`, *optional*): random number generator.
+            return_dict (`bool`): option for returning tuple rather than
+                DDPMSchedulerOutput class
+
+        Returns:
+            [`~schedulers.scheduling_utils.RePaintSchedulerOutput`] or `tuple`:
+            [`~schedulers.scheduling_utils.RePaintSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+
+        """
+        t = timestep
+        prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+        # 1. compute alphas, betas
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+
+        # 2. compute predicted original sample from predicted noise also called
+        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+        pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
+
+        # 3. Clip "predicted x_0"
+        if self.config.clip_sample:
+            pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+        # We choose to follow RePaint Algorithm 1 to get x_{t-1}, however we
+        # substitute formula (7) in the algorithm coming from DDPM paper
+        # (formula (4) Algorithm 2 - Sampling) with formula (12) from DDIM paper.
+        # DDIM schedule gives the same results as DDPM with eta = 1.0
+        # Noise is being reused in 7. and 8., but no impact on quality has
+        # been observed.
+
+        # 5. Add noise
+        noise = torch.randn(
+            model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device
+        )
+        std_dev_t = self.eta * self._get_variance(timestep) ** 0.5
+
+        variance = 0
+        if t > 0 and self.eta > 0:
+            variance = std_dev_t * noise
+
+        # 6. compute "direction pointing to x_t" of formula (12)
+        # from https://arxiv.org/pdf/2010.02502.pdf
+        pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * model_output
+
+        # 7. compute x_{t-1} of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        prev_unknown_part = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction + variance
+
+        # 8. Algorithm 1 Line 5 https://arxiv.org/pdf/2201.09865.pdf
+        prev_known_part = (alpha_prod_t**0.5) * original_image + ((1 - alpha_prod_t) ** 0.5) * noise
+
+        # 9. Algorithm 1 Line 8 https://arxiv.org/pdf/2201.09865.pdf
+        pred_prev_sample = mask * prev_known_part + (1.0 - mask) * prev_unknown_part
+
+        if not return_dict:
+            return (
+                pred_prev_sample,
+                pred_original_sample,
+            )
+
+        return RePaintSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+
+    def undo_step(self, sample, timestep, generator=None):
+        n = self.config.num_train_timesteps // self.num_inference_steps
+
+        for i in range(n):
+            beta = self.betas[timestep + i]
+            noise = torch.randn(sample.shape, generator=generator, device=sample.device)
+
+            # 10. Algorithm 1 Line 10 https://arxiv.org/pdf/2201.09865.pdf
+            sample = (1 - beta) ** 0.5 * sample + beta**0.5 * noise
+
+        return sample
+
+    def add_noise(
+        self,
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.FloatTensor:
+        raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.")
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 5dd583279708..63aa20962f05 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -227,6 +227,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class RePaintPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class ScoreSdeVePipeline(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -347,6 +362,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class RePaintScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class SchedulerMixin(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/pipelines/repaint/__init__.py b/tests/pipelines/repaint/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/repaint/test_repaint.py b/tests/pipelines/repaint/test_repaint.py
new file mode 100644
index 000000000000..23544dfd2468
--- /dev/null
+++ b/tests/pipelines/repaint/test_repaint.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import RePaintPipeline, RePaintScheduler, UNet2DModel
+from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+@slow
+@require_torch_gpu
+class RepaintPipelineIntegrationTests(unittest.TestCase):
+    def test_celebahq(self):
+        original_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
+            "repaint/celeba_hq_256.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
+            "repaint/celeba_hq_256_result.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        model_id = "google/ddpm-ema-celebahq-256"
+        unet = UNet2DModel.from_pretrained(model_id)
+        scheduler = RePaintScheduler.from_config(model_id)
+
+        repaint = RePaintPipeline(unet=unet, scheduler=scheduler).to(torch_device)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = repaint(
+            original_image,
+            mask_image,
+            num_inference_steps=250,
+            eta=0.0,
+            jump_length=10,
+            jump_n_sample=10,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (256, 256, 3)
+        assert np.abs(expected_image - image).mean() < 1e-2

From 269109dbfbbdbe2800535239b881e96e1828a0ef Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Thu, 3 Nov 2022 15:49:20 +0100
Subject: [PATCH 116/269] Continuation of #1035 (#1120)

* remove batch size from repeat

* repeat empty string if uncond_tokens is none

* fix inpaint pipes

* return back whitespace to pass code quality

* Apply suggestions from code review

* Fix typos.

Co-authored-by: Had <had-95@yandex.ru>
---
 examples/community/interpolate_stable_diffusion.py          | 4 ++--
 examples/community/speech_to_image_diffusion.py             | 4 ++--
 examples/community/wildcard_stable_diffusion.py             | 4 ++--
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 4 ++--
 .../stable_diffusion/pipeline_stable_diffusion_img2img.py   | 6 ++++--
 .../stable_diffusion/pipeline_stable_diffusion_inpaint.py   | 4 ++--
 .../pipeline_stable_diffusion_inpaint_legacy.py             | 6 ++++--
 7 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py
index bbb1b0f9e633..de1c6f687aa8 100644
--- a/examples/community/interpolate_stable_diffusion.py
+++ b/examples/community/interpolate_stable_diffusion.py
@@ -278,7 +278,7 @@ def __call__(
         if do_classifier_free_guidance:
             uncond_tokens: List[str]
             if negative_prompt is None:
-                uncond_tokens = [""]
+                uncond_tokens = [""] * batch_size
             elif type(prompt) is not type(negative_prompt):
                 raise TypeError(
                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -307,7 +307,7 @@ def __call__(
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
             uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
 
             # For classifier free guidance, we need to do two forward passes.
diff --git a/examples/community/speech_to_image_diffusion.py b/examples/community/speech_to_image_diffusion.py
index 8b24914cb78f..1a9d296e81f3 100644
--- a/examples/community/speech_to_image_diffusion.py
+++ b/examples/community/speech_to_image_diffusion.py
@@ -148,7 +148,7 @@ def __call__(
         if do_classifier_free_guidance:
             uncond_tokens: List[str]
             if negative_prompt is None:
-                uncond_tokens = [""]
+                uncond_tokens = [""] * batch_size
             elif type(prompt) is not type(negative_prompt):
                 raise TypeError(
                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -177,7 +177,7 @@ def __call__(
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
             uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
 
             # For classifier free guidance, we need to do two forward passes.
diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py
index 79cb4feb1e7c..b0f6375d50c6 100644
--- a/examples/community/wildcard_stable_diffusion.py
+++ b/examples/community/wildcard_stable_diffusion.py
@@ -295,7 +295,7 @@ def __call__(
         if do_classifier_free_guidance:
             uncond_tokens: List[str]
             if negative_prompt is None:
-                uncond_tokens = [""]
+                uncond_tokens = [""] * batch_size
             elif type(prompt) is not type(negative_prompt):
                 raise TypeError(
                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -324,7 +324,7 @@ def __call__(
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
             uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
 
             # For classifier free guidance, we need to do two forward passes.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 3c1eb734a49d..1ccc87804e68 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -297,7 +297,7 @@ def __call__(
         if do_classifier_free_guidance:
             uncond_tokens: List[str]
             if negative_prompt is None:
-                uncond_tokens = [""]
+                uncond_tokens = [""] * batch_size
             elif type(prompt) is not type(negative_prompt):
                 raise TypeError(
                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -326,7 +326,7 @@ def __call__(
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
             uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
 
             # For classifier free guidance, we need to do two forward passes.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index e61fb27acc1e..8284bac85091 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -295,7 +295,7 @@ def __call__(
         if do_classifier_free_guidance:
             uncond_tokens: List[str]
             if negative_prompt is None:
-                uncond_tokens = [""]
+                uncond_tokens = [""] * batch_size
             elif type(prompt) is not type(negative_prompt):
                 raise TypeError(
                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -319,7 +319,9 @@ def __call__(
             uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
 
             # duplicate unconditional embeddings for each generation per prompt
-            uncond_embeddings = uncond_embeddings.repeat_interleave(batch_size * num_images_per_prompt, dim=0)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
 
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index bbe6ee60832c..c200892ef60a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -302,7 +302,7 @@ def __call__(
         if do_classifier_free_guidance:
             uncond_tokens: List[str]
             if negative_prompt is None:
-                uncond_tokens = [""]
+                uncond_tokens = [""] * batch_size
             elif type(prompt) is not type(negative_prompt):
                 raise TypeError(
                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -331,7 +331,7 @@ def __call__(
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
             uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
 
             # For classifier free guidance, we need to do two forward passes.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 6db3624177c6..5c06b74bfa38 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -284,7 +284,7 @@ def __call__(
         if do_classifier_free_guidance:
             uncond_tokens: List[str]
             if negative_prompt is None:
-                uncond_tokens = [""]
+                uncond_tokens = [""] * batch_size
             elif type(prompt) is not type(negative_prompt):
                 raise TypeError(
                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -312,7 +312,9 @@ def __call__(
             uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
 
             # duplicate unconditional embeddings for each generation per prompt
-            uncond_embeddings = uncond_embeddings.repeat_interleave(batch_size * num_images_per_prompt, dim=0)
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
 
             # For classifier free guidance, we need to do two forward passes.
             # Here we concatenate the unconditional and text embeddings into a single batch

From ef2ea33c3bc061fffa8bc4ccd640306ca1a1847d Mon Sep 17 00:00:00 2001
From: Will Berman <wlbberman@gmail.com>
Date: Thu, 3 Nov 2022 08:10:28 -0700
Subject: [PATCH 117/269] VQ-diffusion (#658)

* Changes for VQ-diffusion VQVAE

Add specify dimension of embeddings to VQModel:
`VQModel` will by default set the dimension of embeddings to the number
of latent channels. The VQ-diffusion VQVAE has a smaller
embedding dimension, 128, than number of latent channels, 256.

Add AttnDownEncoderBlock2D and AttnUpDecoderBlock2D to the up and down
unet block helpers. VQ-diffusion's VQVAE uses those two block types.

* Changes for VQ-diffusion transformer

Modify attention.py so SpatialTransformer can be used for
VQ-diffusion's transformer.

SpatialTransformer:
- Can now operate over discrete inputs (classes of vector embeddings) as well as continuous.
- `in_channels` was made optional in the constructor so two locations where it was passed as a positional arg were moved to kwargs
- modified forward pass to take optional timestep embeddings

ImagePositionalEmbeddings:
- added to provide positional embeddings to discrete inputs for latent pixels

BasicTransformerBlock:
- norm layers were made configurable so that the VQ-diffusion could use AdaLayerNorm with timestep embeddings
- modified forward pass to take optional timestep embeddings

CrossAttention:
- now may optionally take a bias parameter for its query, key, and value linear layers

FeedForward:
- Internal layers are now configurable

ApproximateGELU:
- Activation function in VQ-diffusion's feedforward layer

AdaLayerNorm:
- Norm layer modified to incorporate timestep embeddings

* Add VQ-diffusion scheduler

* Add VQ-diffusion pipeline

* Add VQ-diffusion convert script to diffusers

* Add VQ-diffusion dummy objects

* Add VQ-diffusion markdown docs

* Add VQ-diffusion tests

* some renaming

* some fixes

* more renaming

* correct

* fix typo

* correct weights

* finalize

* fix tests

* Apply suggestions from code review

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* Apply suggestions from code review

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* finish

* finish

* up

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 docs/source/_toctree.yml                      |   2 +
 docs/source/api/models.mdx                    |   6 +
 docs/source/api/pipelines/overview.mdx        |  28 +-
 docs/source/api/pipelines/vq_diffusion.mdx    |  34 +
 docs/source/api/schedulers.mdx                |   9 +-
 docs/source/index.mdx                         |   2 +
 scripts/convert_vq_diffusion_to_diffusers.py  | 885 ++++++++++++++++++
 src/diffusers/__init__.py                     |   4 +-
 src/diffusers/models/__init__.py              |   1 +
 src/diffusers/models/attention.py             | 444 ++++++---
 src/diffusers/models/attention_flax.py        |   2 +-
 src/diffusers/models/embeddings.py            |  65 ++
 src/diffusers/models/unet_2d_blocks.py        |  94 +-
 src/diffusers/models/unet_2d_blocks_flax.py   |   8 +-
 src/diffusers/models/vae.py                   |  22 +-
 src/diffusers/pipelines/__init__.py           |   1 +
 .../pipelines/vq_diffusion/__init__.py        |   1 +
 .../vq_diffusion/pipeline_vq_diffusion.py     | 253 +++++
 src/diffusers/schedulers/__init__.py          |   1 +
 .../schedulers/scheduling_vq_diffusion.py     | 494 ++++++++++
 src/diffusers/utils/dummy_pt_objects.py       |  45 +
 tests/pipelines/vq_diffusion/__init__.py      |   0
 .../vq_diffusion/test_vq_diffusion.py         | 175 ++++
 tests/test_layers_utils.py                    | 183 +++-
 tests/test_scheduler.py                       | 138 ++-
 25 files changed, 2674 insertions(+), 223 deletions(-)
 create mode 100644 docs/source/api/pipelines/vq_diffusion.mdx
 create mode 100644 scripts/convert_vq_diffusion_to_diffusers.py
 create mode 100644 src/diffusers/pipelines/vq_diffusion/__init__.py
 create mode 100644 src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
 create mode 100644 src/diffusers/schedulers/scheduling_vq_diffusion.py
 create mode 100644 tests/pipelines/vq_diffusion/__init__.py
 create mode 100644 tests/pipelines/vq_diffusion/test_vq_diffusion.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 331d4fff7870..70d64b80de7a 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -96,6 +96,8 @@
       title: "Stochastic Karras VE"
     - local: api/pipelines/dance_diffusion
       title: "Dance Diffusion"
+    - local: api/pipelines/vq_diffusion
+      title: "VQ Diffusion"
     - local: api/pipelines/repaint
       title: "RePaint"
     title: "Pipelines"
diff --git a/docs/source/api/models.mdx b/docs/source/api/models.mdx
index c3f5e65edfbd..2e1e8798a7ef 100644
--- a/docs/source/api/models.mdx
+++ b/docs/source/api/models.mdx
@@ -49,6 +49,12 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
 ## AutoencoderKL
 [[autodoc]] AutoencoderKL
 
+## Transformer2DModel
+[[autodoc]] Transformer2DModel
+
+## Transformer2DModelOutput
+[[autodoc]] models.attention.Transformer2DModelOutput
+
 ## FlaxModelMixin
 [[autodoc]] FlaxModelMixin
 
diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index a53a2f8b4137..5a15473cf19c 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -41,22 +41,22 @@ If you are looking for *official* training examples, please have a look at [exam
 The following table summarizes all officially supported pipelines, their corresponding paper, and if 
 available a colab notebook to directly try them out.
 
-| Pipeline                                             | Paper                                                                                                                        |                 Tasks                 | Colab
-|------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|:-------------------------------------:|:---:|
-| [ddpm](./ddpm)                                       | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239)                                             |    Unconditional Image Generation     |
-| [ddim](./ddim)                                       | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502)                                                  |    Unconditional Image Generation     | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [latent_diffusion](./latent_diffusion)               | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)                         |       Text-to-Image Generation        | 
-| [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)                         |    Unconditional Image Generation     | 
-| [pndm](./pndm)                                       | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778)                           |    Unconditional Image Generation     | 
-| [score_sde_ve](./score_sde_ve)                       | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) |    Unconditional Image Generation     | 
-| [score_sde_vp](./score_sde_vp)                       | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) |    Unconditional Image Generation     | 
-| [stable_diffusion](./stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            |       Text-to-Image Generation        | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [stable_diffusion](./stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
-| [stable_diffusion](./stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            |     Text-Guided Image Inpainting      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
-| [stochastic_karras_ve](./stochastic_karras_ve)       | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364)                    |    Unconditional Image Generation     | 
+| Pipeline | Paper | Tasks | Colab
+|---|---|:---:|:---:|
+| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
+| [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
+| [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
+| [pndm](./pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
+| [score_sde_ve](./score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [score_sde_vp](./score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
+| [vq_diffusion](./vq_diffusion) | [**Vector Quantized Diffusion Model for Text-to-Image Synthesis**](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
 | [repaint](./repaint)                                 | [**RePaint: Inpainting using Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2201.09865)                   |           Image Inpainting            | 
 
-
 **Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
 
 However, most of them can be adapted to use different scheduler components or even different model components. Some pipeline examples are shown in the [Examples](#examples) below.
diff --git a/docs/source/api/pipelines/vq_diffusion.mdx b/docs/source/api/pipelines/vq_diffusion.mdx
new file mode 100644
index 000000000000..92cc903eee79
--- /dev/null
+++ b/docs/source/api/pipelines/vq_diffusion.mdx
@@ -0,0 +1,34 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# VQDiffusion
+
+## Overview
+
+[Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) by Shuyang Gu, Dong Chen, Jianmin Bao, Fang Wen, Bo Zhang, Dongdong Chen, Lu Yuan, Baining Guo
+
+The abstract of the paper is the following:
+
+We present the vector quantized diffusion (VQ-Diffusion) model for text-to-image generation. This method is based on a vector quantized variational autoencoder (VQ-VAE) whose latent space is modeled by a conditional variant of the recently developed Denoising Diffusion Probabilistic Model (DDPM). We find that this latent-space method is well-suited for text-to-image generation tasks because it not only eliminates the unidirectional bias with existing methods but also allows us to incorporate a mask-and-replace diffusion strategy to avoid the accumulation of errors, which is a serious problem with existing methods. Our experiments show that the VQ-Diffusion produces significantly better text-to-image generation results when compared with conventional autoregressive (AR) models with similar numbers of parameters. Compared with previous GAN-based text-to-image methods, our VQ-Diffusion can handle more complex scenes and improve the synthesized image quality by a large margin. Finally, we show that the image generation computation in our method can be made highly efficient by reparameterization. With traditional AR methods, the text-to-image generation time increases linearly with the output image resolution and hence is quite time consuming even for normal size images. The VQ-Diffusion allows us to achieve a better trade-off between quality and speed. Our experiments indicate that the VQ-Diffusion model with the reparameterization is fifteen times faster than traditional AR methods while achieving a better image quality.
+
+The original codebase can be found [here](https://github.com/microsoft/VQ-Diffusion).
+
+## Available Pipelines:
+
+| Pipeline | Tasks | Colab
+|---|---|:---:|
+| [pipeline_vq_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py) | *Text-to-Image Generation* | - |
+
+
+## VQDiffusionPipeline
+[[autodoc]] pipelines.vq_diffusion.pipeline_vq_diffusion.VQDiffusionPipeline
+    - __call__
diff --git a/docs/source/api/schedulers.mdx b/docs/source/api/schedulers.mdx
index 6e7da10e33e5..f073f6b37912 100644
--- a/docs/source/api/schedulers.mdx
+++ b/docs/source/api/schedulers.mdx
@@ -113,7 +113,6 @@ Score SDE-VP is under construction.
 
 [[autodoc]] schedulers.scheduling_sde_vp.ScoreSdeVpScheduler
 
-
 #### Euler scheduler
 
 Euler scheduler (Algorithm 2) from the paper [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364) by Karras et al. (2022). Based on the original [k-diffusion](https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51) implementation by Katherine Crowson.
@@ -130,6 +129,12 @@ Fast scheduler which often times generates good outputs with 20-30 steps.
 [[autodoc]] EulerAncestralDiscreteScheduler
 
 
+#### VQDiffusionScheduler
+
+Original paper can be found [here](https://arxiv.org/abs/2111.14822)
+
+[[autodoc]] VQDiffusionScheduler
+
 #### RePaint scheduler
 
 DDPM-based inpainting scheduler for unsupervised inpainting with extreme masks. 
@@ -137,4 +142,4 @@ Intended for use with [`RePaintPipeline`].
 Based on the paper [RePaint: Inpainting using Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2201.09865) 
 and the original implementation by Andreas Lugmayr et al.: https://github.com/andreas128/RePaint
 
-[[autodoc]] RePaintScheduler
\ No newline at end of file
+[[autodoc]] RePaintScheduler
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 392b22399908..62a3e88f173d 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -34,6 +34,7 @@ available a colab notebook to directly try them out.
 
 | Pipeline | Paper | Tasks | Colab
 |---|---|:---:|:---:|
+| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
 | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
 | [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
 | [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
@@ -45,5 +46,6 @@ available a colab notebook to directly try them out.
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
 | [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
+| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 
 
 **Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
diff --git a/scripts/convert_vq_diffusion_to_diffusers.py b/scripts/convert_vq_diffusion_to_diffusers.py
new file mode 100644
index 000000000000..ae105e30362e
--- /dev/null
+++ b/scripts/convert_vq_diffusion_to_diffusers.py
@@ -0,0 +1,885 @@
+"""
+This script ports models from VQ-diffusion (https://github.com/microsoft/VQ-Diffusion) to diffusers.
+
+It currently only supports porting the ITHQ dataset.
+
+ITHQ dataset:
+```sh
+# From the root directory of diffusers.
+
+# Download the VQVAE checkpoint
+$ wget https://facevcstandard.blob.core.windows.net/v-zhictang/Improved-VQ-Diffusion_model_release/ithq_vqvae.pth?sv=2020-10-02&st=2022-05-30T15%3A17%3A18Z&se=2030-05-31T15%3A17%3A00Z&sr=b&sp=r&sig=1jVavHFPpUjDs%2FTO1V3PTezaNbPp2Nx8MxiWI7y6fEY%3D -O ithq_vqvae.pth
+
+# Download the VQVAE config
+# NOTE that in VQ-diffusion the documented file is `configs/ithq.yaml` but the target class
+# `image_synthesis.modeling.codecs.image_codec.ema_vqvae.PatchVQVAE`
+# loads `OUTPUT/pretrained_model/taming_dvae/config.yaml`
+$ wget https://raw.githubusercontent.com/microsoft/VQ-Diffusion/main/OUTPUT/pretrained_model/taming_dvae/config.yaml -O ithq_vqvae.yaml
+
+# Download the main model checkpoint
+$ wget https://facevcstandard.blob.core.windows.net/v-zhictang/Improved-VQ-Diffusion_model_release/ithq_learnable.pth?sv=2020-10-02&st=2022-05-30T10%3A22%3A06Z&se=2030-05-31T10%3A22%3A00Z&sr=b&sp=r&sig=GOE%2Bza02%2FPnGxYVOOPtwrTR4RA3%2F5NVgMxdW4kjaEZ8%3D -O ithq_learnable.pth
+
+# Download the main model config
+$ wget https://raw.githubusercontent.com/microsoft/VQ-Diffusion/main/configs/ithq.yaml -O ithq.yaml
+
+# run the convert script
+$ python ./scripts/convert_vq_diffusion_to_diffusers.py \
+    --checkpoint_path ./ithq_learnable.pth \
+    --original_config_file ./ithq.yaml \
+    --vqvae_checkpoint_path ./ithq_vqvae.pth \
+    --vqvae_original_config_file ./ithq_vqvae.yaml \
+    --dump_path <path to save pre-trained `VQDiffusionPipeline`>
+```
+"""
+
+import argparse
+import tempfile
+
+import torch
+
+import yaml
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+from diffusers import VQDiffusionPipeline, VQDiffusionScheduler, VQModel
+from diffusers.models.attention import Transformer2DModel
+from transformers import CLIPTextModel, CLIPTokenizer
+from yaml.loader import FullLoader
+
+
+try:
+    from omegaconf import OmegaConf
+except ImportError:
+    raise ImportError(
+        "OmegaConf is required to convert the VQ Diffusion checkpoints. Please install it with `pip install"
+        " OmegaConf`."
+    )
+
+# vqvae model
+
+PORTED_VQVAES = ["image_synthesis.modeling.codecs.image_codec.patch_vqgan.PatchVQGAN"]
+
+
+def vqvae_model_from_original_config(original_config):
+    assert original_config.target in PORTED_VQVAES, f"{original_config.target} has not yet been ported to diffusers."
+
+    original_config = original_config.params
+
+    original_encoder_config = original_config.encoder_config.params
+    original_decoder_config = original_config.decoder_config.params
+
+    in_channels = original_encoder_config.in_channels
+    out_channels = original_decoder_config.out_ch
+
+    down_block_types = get_down_block_types(original_encoder_config)
+    up_block_types = get_up_block_types(original_decoder_config)
+
+    assert original_encoder_config.ch == original_decoder_config.ch
+    assert original_encoder_config.ch_mult == original_decoder_config.ch_mult
+    block_out_channels = tuple(
+        [original_encoder_config.ch * a_ch_mult for a_ch_mult in original_encoder_config.ch_mult]
+    )
+
+    assert original_encoder_config.num_res_blocks == original_decoder_config.num_res_blocks
+    layers_per_block = original_encoder_config.num_res_blocks
+
+    assert original_encoder_config.z_channels == original_decoder_config.z_channels
+    latent_channels = original_encoder_config.z_channels
+
+    num_vq_embeddings = original_config.n_embed
+
+    # Hard coded value for ResnetBlock.GoupNorm(num_groups) in VQ-diffusion
+    norm_num_groups = 32
+
+    e_dim = original_config.embed_dim
+
+    model = VQModel(
+        in_channels=in_channels,
+        out_channels=out_channels,
+        down_block_types=down_block_types,
+        up_block_types=up_block_types,
+        block_out_channels=block_out_channels,
+        layers_per_block=layers_per_block,
+        latent_channels=latent_channels,
+        num_vq_embeddings=num_vq_embeddings,
+        norm_num_groups=norm_num_groups,
+        vq_embed_dim=e_dim,
+    )
+
+    return model
+
+
+def get_down_block_types(original_encoder_config):
+    attn_resolutions = coerce_attn_resolutions(original_encoder_config.attn_resolutions)
+    num_resolutions = len(original_encoder_config.ch_mult)
+    resolution = coerce_resolution(original_encoder_config.resolution)
+
+    curr_res = resolution
+    down_block_types = []
+
+    for _ in range(num_resolutions):
+        if curr_res in attn_resolutions:
+            down_block_type = "AttnDownEncoderBlock2D"
+        else:
+            down_block_type = "DownEncoderBlock2D"
+
+        down_block_types.append(down_block_type)
+
+        curr_res = [r // 2 for r in curr_res]
+
+    return down_block_types
+
+
+def get_up_block_types(original_decoder_config):
+    attn_resolutions = coerce_attn_resolutions(original_decoder_config.attn_resolutions)
+    num_resolutions = len(original_decoder_config.ch_mult)
+    resolution = coerce_resolution(original_decoder_config.resolution)
+
+    curr_res = [r // 2 ** (num_resolutions - 1) for r in resolution]
+    up_block_types = []
+
+    for _ in reversed(range(num_resolutions)):
+        if curr_res in attn_resolutions:
+            up_block_type = "AttnUpDecoderBlock2D"
+        else:
+            up_block_type = "UpDecoderBlock2D"
+
+        up_block_types.append(up_block_type)
+
+        curr_res = [r * 2 for r in curr_res]
+
+    return up_block_types
+
+
+def coerce_attn_resolutions(attn_resolutions):
+    attn_resolutions = OmegaConf.to_object(attn_resolutions)
+    attn_resolutions_ = []
+    for ar in attn_resolutions:
+        if isinstance(ar, (list, tuple)):
+            attn_resolutions_.append(list(ar))
+        else:
+            attn_resolutions_.append([ar, ar])
+    return attn_resolutions_
+
+
+def coerce_resolution(resolution):
+    resolution = OmegaConf.to_object(resolution)
+    if isinstance(resolution, int):
+        resolution = [resolution, resolution]  # H, W
+    elif isinstance(resolution, (tuple, list)):
+        resolution = list(resolution)
+    else:
+        raise ValueError("Unknown type of resolution:", resolution)
+    return resolution
+
+
+# done vqvae model
+
+# vqvae checkpoint
+
+
+def vqvae_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    diffusers_checkpoint.update(vqvae_encoder_to_diffusers_checkpoint(model, checkpoint))
+
+    # quant_conv
+
+    diffusers_checkpoint.update(
+        {
+            "quant_conv.weight": checkpoint["quant_conv.weight"],
+            "quant_conv.bias": checkpoint["quant_conv.bias"],
+        }
+    )
+
+    # quantize
+    diffusers_checkpoint.update({"quantize.embedding.weight": checkpoint["quantize.embedding"]})
+
+    # post_quant_conv
+    diffusers_checkpoint.update(
+        {
+            "post_quant_conv.weight": checkpoint["post_quant_conv.weight"],
+            "post_quant_conv.bias": checkpoint["post_quant_conv.bias"],
+        }
+    )
+
+    # decoder
+    diffusers_checkpoint.update(vqvae_decoder_to_diffusers_checkpoint(model, checkpoint))
+
+    return diffusers_checkpoint
+
+
+def vqvae_encoder_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    # conv_in
+    diffusers_checkpoint.update(
+        {
+            "encoder.conv_in.weight": checkpoint["encoder.conv_in.weight"],
+            "encoder.conv_in.bias": checkpoint["encoder.conv_in.bias"],
+        }
+    )
+
+    # down_blocks
+    for down_block_idx, down_block in enumerate(model.encoder.down_blocks):
+        diffusers_down_block_prefix = f"encoder.down_blocks.{down_block_idx}"
+        down_block_prefix = f"encoder.down.{down_block_idx}"
+
+        # resnets
+        for resnet_idx, resnet in enumerate(down_block.resnets):
+            diffusers_resnet_prefix = f"{diffusers_down_block_prefix}.resnets.{resnet_idx}"
+            resnet_prefix = f"{down_block_prefix}.block.{resnet_idx}"
+
+            diffusers_checkpoint.update(
+                vqvae_resnet_to_diffusers_checkpoint(
+                    resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+                )
+            )
+
+        # downsample
+
+        # do not include the downsample when on the last down block
+        # There is no downsample on the last down block
+        if down_block_idx != len(model.encoder.down_blocks) - 1:
+            # There's a single downsample in the original checkpoint but a list of downsamples
+            # in the diffusers model.
+            diffusers_downsample_prefix = f"{diffusers_down_block_prefix}.downsamplers.0.conv"
+            downsample_prefix = f"{down_block_prefix}.downsample.conv"
+            diffusers_checkpoint.update(
+                {
+                    f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"],
+                    f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"],
+                }
+            )
+
+        # attentions
+
+        if hasattr(down_block, "attentions"):
+            for attention_idx, _ in enumerate(down_block.attentions):
+                diffusers_attention_prefix = f"{diffusers_down_block_prefix}.attentions.{attention_idx}"
+                attention_prefix = f"{down_block_prefix}.attn.{attention_idx}"
+                diffusers_checkpoint.update(
+                    vqvae_attention_to_diffusers_checkpoint(
+                        checkpoint,
+                        diffusers_attention_prefix=diffusers_attention_prefix,
+                        attention_prefix=attention_prefix,
+                    )
+                )
+
+    # mid block
+
+    # mid block attentions
+
+    # There is a single hardcoded attention block in the middle of the VQ-diffusion encoder
+    diffusers_attention_prefix = "encoder.mid_block.attentions.0"
+    attention_prefix = "encoder.mid.attn_1"
+    diffusers_checkpoint.update(
+        vqvae_attention_to_diffusers_checkpoint(
+            checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
+        )
+    )
+
+    # mid block resnets
+
+    for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets):
+        diffusers_resnet_prefix = f"encoder.mid_block.resnets.{diffusers_resnet_idx}"
+
+        # the hardcoded prefixes to `block_` are 1 and 2
+        orig_resnet_idx = diffusers_resnet_idx + 1
+        # There are two hardcoded resnets in the middle of the VQ-diffusion encoder
+        resnet_prefix = f"encoder.mid.block_{orig_resnet_idx}"
+
+        diffusers_checkpoint.update(
+            vqvae_resnet_to_diffusers_checkpoint(
+                resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+            )
+        )
+
+    diffusers_checkpoint.update(
+        {
+            # conv_norm_out
+            "encoder.conv_norm_out.weight": checkpoint["encoder.norm_out.weight"],
+            "encoder.conv_norm_out.bias": checkpoint["encoder.norm_out.bias"],
+            # conv_out
+            "encoder.conv_out.weight": checkpoint["encoder.conv_out.weight"],
+            "encoder.conv_out.bias": checkpoint["encoder.conv_out.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def vqvae_decoder_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    # conv in
+    diffusers_checkpoint.update(
+        {
+            "decoder.conv_in.weight": checkpoint["decoder.conv_in.weight"],
+            "decoder.conv_in.bias": checkpoint["decoder.conv_in.bias"],
+        }
+    )
+
+    # up_blocks
+
+    for diffusers_up_block_idx, up_block in enumerate(model.decoder.up_blocks):
+        # up_blocks are stored in reverse order in the VQ-diffusion checkpoint
+        orig_up_block_idx = len(model.decoder.up_blocks) - 1 - diffusers_up_block_idx
+
+        diffusers_up_block_prefix = f"decoder.up_blocks.{diffusers_up_block_idx}"
+        up_block_prefix = f"decoder.up.{orig_up_block_idx}"
+
+        # resnets
+        for resnet_idx, resnet in enumerate(up_block.resnets):
+            diffusers_resnet_prefix = f"{diffusers_up_block_prefix}.resnets.{resnet_idx}"
+            resnet_prefix = f"{up_block_prefix}.block.{resnet_idx}"
+
+            diffusers_checkpoint.update(
+                vqvae_resnet_to_diffusers_checkpoint(
+                    resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+                )
+            )
+
+        # upsample
+
+        # there is no up sample on the last up block
+        if diffusers_up_block_idx != len(model.decoder.up_blocks) - 1:
+            # There's a single upsample in the VQ-diffusion checkpoint but a list of downsamples
+            # in the diffusers model.
+            diffusers_downsample_prefix = f"{diffusers_up_block_prefix}.upsamplers.0.conv"
+            downsample_prefix = f"{up_block_prefix}.upsample.conv"
+            diffusers_checkpoint.update(
+                {
+                    f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"],
+                    f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"],
+                }
+            )
+
+        # attentions
+
+        if hasattr(up_block, "attentions"):
+            for attention_idx, _ in enumerate(up_block.attentions):
+                diffusers_attention_prefix = f"{diffusers_up_block_prefix}.attentions.{attention_idx}"
+                attention_prefix = f"{up_block_prefix}.attn.{attention_idx}"
+                diffusers_checkpoint.update(
+                    vqvae_attention_to_diffusers_checkpoint(
+                        checkpoint,
+                        diffusers_attention_prefix=diffusers_attention_prefix,
+                        attention_prefix=attention_prefix,
+                    )
+                )
+
+    # mid block
+
+    # mid block attentions
+
+    # There is a single hardcoded attention block in the middle of the VQ-diffusion decoder
+    diffusers_attention_prefix = "decoder.mid_block.attentions.0"
+    attention_prefix = "decoder.mid.attn_1"
+    diffusers_checkpoint.update(
+        vqvae_attention_to_diffusers_checkpoint(
+            checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
+        )
+    )
+
+    # mid block resnets
+
+    for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets):
+        diffusers_resnet_prefix = f"decoder.mid_block.resnets.{diffusers_resnet_idx}"
+
+        # the hardcoded prefixes to `block_` are 1 and 2
+        orig_resnet_idx = diffusers_resnet_idx + 1
+        # There are two hardcoded resnets in the middle of the VQ-diffusion decoder
+        resnet_prefix = f"decoder.mid.block_{orig_resnet_idx}"
+
+        diffusers_checkpoint.update(
+            vqvae_resnet_to_diffusers_checkpoint(
+                resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix
+            )
+        )
+
+    diffusers_checkpoint.update(
+        {
+            # conv_norm_out
+            "decoder.conv_norm_out.weight": checkpoint["decoder.norm_out.weight"],
+            "decoder.conv_norm_out.bias": checkpoint["decoder.norm_out.bias"],
+            # conv_out
+            "decoder.conv_out.weight": checkpoint["decoder.conv_out.weight"],
+            "decoder.conv_out.bias": checkpoint["decoder.conv_out.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def vqvae_resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix):
+    rv = {
+        # norm1
+        f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.norm1.weight"],
+        f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.norm1.bias"],
+        # conv1
+        f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"],
+        f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"],
+        # norm2
+        f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.norm2.weight"],
+        f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.norm2.bias"],
+        # conv2
+        f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"],
+        f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"],
+    }
+
+    if resnet.conv_shortcut is not None:
+        rv.update(
+            {
+                f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"],
+                f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"],
+            }
+        )
+
+    return rv
+
+
+def vqvae_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix):
+    return {
+        # group_norm
+        f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"],
+        f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"],
+        # query
+        f"{diffusers_attention_prefix}.query.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.query.bias": checkpoint[f"{attention_prefix}.q.bias"],
+        # key
+        f"{diffusers_attention_prefix}.key.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.key.bias": checkpoint[f"{attention_prefix}.k.bias"],
+        # value
+        f"{diffusers_attention_prefix}.value.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0],
+        f"{diffusers_attention_prefix}.value.bias": checkpoint[f"{attention_prefix}.v.bias"],
+        # proj_attn
+        f"{diffusers_attention_prefix}.proj_attn.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][
+            :, :, 0, 0
+        ],
+        f"{diffusers_attention_prefix}.proj_attn.bias": checkpoint[f"{attention_prefix}.proj_out.bias"],
+    }
+
+
+# done vqvae checkpoint
+
+# transformer model
+
+PORTED_DIFFUSIONS = ["image_synthesis.modeling.transformers.diffusion_transformer.DiffusionTransformer"]
+PORTED_TRANSFORMERS = ["image_synthesis.modeling.transformers.transformer_utils.Text2ImageTransformer"]
+PORTED_CONTENT_EMBEDDINGS = ["image_synthesis.modeling.embeddings.dalle_mask_image_embedding.DalleMaskImageEmbedding"]
+
+
+def transformer_model_from_original_config(
+    original_diffusion_config, original_transformer_config, original_content_embedding_config
+):
+    assert (
+        original_diffusion_config.target in PORTED_DIFFUSIONS
+    ), f"{original_diffusion_config.target} has not yet been ported to diffusers."
+    assert (
+        original_transformer_config.target in PORTED_TRANSFORMERS
+    ), f"{original_transformer_config.target} has not yet been ported to diffusers."
+    assert (
+        original_content_embedding_config.target in PORTED_CONTENT_EMBEDDINGS
+    ), f"{original_content_embedding_config.target} has not yet been ported to diffusers."
+
+    original_diffusion_config = original_diffusion_config.params
+    original_transformer_config = original_transformer_config.params
+    original_content_embedding_config = original_content_embedding_config.params
+
+    inner_dim = original_transformer_config["n_embd"]
+
+    n_heads = original_transformer_config["n_head"]
+
+    # VQ-Diffusion gives dimension of the multi-headed attention layers as the
+    # number of attention heads times the sequence length (the dimension) of a
+    # single head. We want to specify our attention blocks with those values
+    # specified separately
+    assert inner_dim % n_heads == 0
+    d_head = inner_dim // n_heads
+
+    depth = original_transformer_config["n_layer"]
+    context_dim = original_transformer_config["condition_dim"]
+
+    num_embed = original_content_embedding_config["num_embed"]
+    # the number of embeddings in the transformer includes the mask embedding.
+    # the content embedding (the vqvae) does not include the mask embedding.
+    num_embed = num_embed + 1
+
+    height = original_transformer_config["content_spatial_size"][0]
+    width = original_transformer_config["content_spatial_size"][1]
+
+    assert width == height, "width has to be equal to height"
+    dropout = original_transformer_config["resid_pdrop"]
+    num_embeds_ada_norm = original_diffusion_config["diffusion_step"]
+
+    model_kwargs = {
+        "attention_bias": True,
+        "cross_attention_dim": context_dim,
+        "attention_head_dim": d_head,
+        "num_layers": depth,
+        "dropout": dropout,
+        "num_attention_heads": n_heads,
+        "num_vector_embeds": num_embed,
+        "num_embeds_ada_norm": num_embeds_ada_norm,
+        "norm_num_groups": 32,
+        "sample_size": width,
+        "activation_fn": "geglu-approximate",
+    }
+
+    model = Transformer2DModel(**model_kwargs)
+    return model
+
+
+# done transformer model
+
+# transformer checkpoint
+
+
+def transformer_original_checkpoint_to_diffusers_checkpoint(model, checkpoint):
+    diffusers_checkpoint = {}
+
+    transformer_prefix = "transformer.transformer"
+
+    diffusers_latent_image_embedding_prefix = "latent_image_embedding"
+    latent_image_embedding_prefix = f"{transformer_prefix}.content_emb"
+
+    # DalleMaskImageEmbedding
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_latent_image_embedding_prefix}.emb.weight": checkpoint[
+                f"{latent_image_embedding_prefix}.emb.weight"
+            ],
+            f"{diffusers_latent_image_embedding_prefix}.height_emb.weight": checkpoint[
+                f"{latent_image_embedding_prefix}.height_emb.weight"
+            ],
+            f"{diffusers_latent_image_embedding_prefix}.width_emb.weight": checkpoint[
+                f"{latent_image_embedding_prefix}.width_emb.weight"
+            ],
+        }
+    )
+
+    # transformer blocks
+    for transformer_block_idx, transformer_block in enumerate(model.transformer_blocks):
+        diffusers_transformer_block_prefix = f"transformer_blocks.{transformer_block_idx}"
+        transformer_block_prefix = f"{transformer_prefix}.blocks.{transformer_block_idx}"
+
+        # ada norm block
+        diffusers_ada_norm_prefix = f"{diffusers_transformer_block_prefix}.norm1"
+        ada_norm_prefix = f"{transformer_block_prefix}.ln1"
+
+        diffusers_checkpoint.update(
+            transformer_ada_norm_to_diffusers_checkpoint(
+                checkpoint, diffusers_ada_norm_prefix=diffusers_ada_norm_prefix, ada_norm_prefix=ada_norm_prefix
+            )
+        )
+
+        # attention block
+        diffusers_attention_prefix = f"{diffusers_transformer_block_prefix}.attn1"
+        attention_prefix = f"{transformer_block_prefix}.attn1"
+
+        diffusers_checkpoint.update(
+            transformer_attention_to_diffusers_checkpoint(
+                checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
+            )
+        )
+
+        # ada norm block
+        diffusers_ada_norm_prefix = f"{diffusers_transformer_block_prefix}.norm2"
+        ada_norm_prefix = f"{transformer_block_prefix}.ln1_1"
+
+        diffusers_checkpoint.update(
+            transformer_ada_norm_to_diffusers_checkpoint(
+                checkpoint, diffusers_ada_norm_prefix=diffusers_ada_norm_prefix, ada_norm_prefix=ada_norm_prefix
+            )
+        )
+
+        # attention block
+        diffusers_attention_prefix = f"{diffusers_transformer_block_prefix}.attn2"
+        attention_prefix = f"{transformer_block_prefix}.attn2"
+
+        diffusers_checkpoint.update(
+            transformer_attention_to_diffusers_checkpoint(
+                checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix
+            )
+        )
+
+        # norm block
+        diffusers_norm_block_prefix = f"{diffusers_transformer_block_prefix}.norm3"
+        norm_block_prefix = f"{transformer_block_prefix}.ln2"
+
+        diffusers_checkpoint.update(
+            {
+                f"{diffusers_norm_block_prefix}.weight": checkpoint[f"{norm_block_prefix}.weight"],
+                f"{diffusers_norm_block_prefix}.bias": checkpoint[f"{norm_block_prefix}.bias"],
+            }
+        )
+
+        # feedforward block
+        diffusers_feedforward_prefix = f"{diffusers_transformer_block_prefix}.ff"
+        feedforward_prefix = f"{transformer_block_prefix}.mlp"
+
+        diffusers_checkpoint.update(
+            transformer_feedforward_to_diffusers_checkpoint(
+                checkpoint,
+                diffusers_feedforward_prefix=diffusers_feedforward_prefix,
+                feedforward_prefix=feedforward_prefix,
+            )
+        )
+
+    # to logits
+
+    diffusers_norm_out_prefix = "norm_out"
+    norm_out_prefix = f"{transformer_prefix}.to_logits.0"
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_norm_out_prefix}.weight": checkpoint[f"{norm_out_prefix}.weight"],
+            f"{diffusers_norm_out_prefix}.bias": checkpoint[f"{norm_out_prefix}.bias"],
+        }
+    )
+
+    diffusers_out_prefix = "out"
+    out_prefix = f"{transformer_prefix}.to_logits.1"
+
+    diffusers_checkpoint.update(
+        {
+            f"{diffusers_out_prefix}.weight": checkpoint[f"{out_prefix}.weight"],
+            f"{diffusers_out_prefix}.bias": checkpoint[f"{out_prefix}.bias"],
+        }
+    )
+
+    return diffusers_checkpoint
+
+
+def transformer_ada_norm_to_diffusers_checkpoint(checkpoint, *, diffusers_ada_norm_prefix, ada_norm_prefix):
+    return {
+        f"{diffusers_ada_norm_prefix}.emb.weight": checkpoint[f"{ada_norm_prefix}.emb.weight"],
+        f"{diffusers_ada_norm_prefix}.linear.weight": checkpoint[f"{ada_norm_prefix}.linear.weight"],
+        f"{diffusers_ada_norm_prefix}.linear.bias": checkpoint[f"{ada_norm_prefix}.linear.bias"],
+    }
+
+
+def transformer_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix):
+    return {
+        # key
+        f"{diffusers_attention_prefix}.to_k.weight": checkpoint[f"{attention_prefix}.key.weight"],
+        f"{diffusers_attention_prefix}.to_k.bias": checkpoint[f"{attention_prefix}.key.bias"],
+        # query
+        f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.query.weight"],
+        f"{diffusers_attention_prefix}.to_q.bias": checkpoint[f"{attention_prefix}.query.bias"],
+        # value
+        f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.value.weight"],
+        f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.value.bias"],
+        # linear out
+        f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj.weight"],
+        f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj.bias"],
+    }
+
+
+def transformer_feedforward_to_diffusers_checkpoint(checkpoint, *, diffusers_feedforward_prefix, feedforward_prefix):
+    return {
+        f"{diffusers_feedforward_prefix}.net.0.proj.weight": checkpoint[f"{feedforward_prefix}.0.weight"],
+        f"{diffusers_feedforward_prefix}.net.0.proj.bias": checkpoint[f"{feedforward_prefix}.0.bias"],
+        f"{diffusers_feedforward_prefix}.net.2.weight": checkpoint[f"{feedforward_prefix}.2.weight"],
+        f"{diffusers_feedforward_prefix}.net.2.bias": checkpoint[f"{feedforward_prefix}.2.bias"],
+    }
+
+
+# done transformer checkpoint
+
+
+def read_config_file(filename):
+    # The yaml file contains annotations that certain values should
+    # loaded as tuples. By default, OmegaConf will panic when reading
+    # these. Instead, we can manually read the yaml with the FullLoader and then
+    # construct the OmegaConf object.
+    with open(filename) as f:
+        original_config = yaml.load(f, FullLoader)
+
+    return OmegaConf.create(original_config)
+
+
+# We take separate arguments for the vqvae because the ITHQ vqvae config file
+# is separate from the config file for the rest of the model.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--vqvae_checkpoint_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the vqvae checkpoint to convert.",
+    )
+
+    parser.add_argument(
+        "--vqvae_original_config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The YAML config file corresponding to the original architecture for the vqvae.",
+    )
+
+    parser.add_argument(
+        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
+    )
+
+    parser.add_argument(
+        "--original_config_file",
+        default=None,
+        type=str,
+        required=True,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    parser.add_argument(
+        "--checkpoint_load_device",
+        default="cpu",
+        type=str,
+        required=False,
+        help="The device passed to `map_location` when loading checkpoints.",
+    )
+
+    # See link for how ema weights are always selected
+    # https://github.com/microsoft/VQ-Diffusion/blob/3c98e77f721db7c787b76304fa2c96a36c7b00af/inference_VQ_Diffusion.py#L65
+    parser.add_argument(
+        "--no_use_ema",
+        action="store_true",
+        required=False,
+        help=(
+            "Set to not use the ema weights from the original VQ-Diffusion checkpoint. You probably do not want to set"
+            " it as the original VQ-Diffusion always uses the ema weights when loading models."
+        ),
+    )
+
+    args = parser.parse_args()
+
+    use_ema = not args.no_use_ema
+
+    print(f"loading checkpoints to {args.checkpoint_load_device}")
+
+    checkpoint_map_location = torch.device(args.checkpoint_load_device)
+
+    # vqvae_model
+
+    print(f"loading vqvae, config: {args.vqvae_original_config_file}, checkpoint: {args.vqvae_checkpoint_path}")
+
+    vqvae_original_config = read_config_file(args.vqvae_original_config_file).model
+    vqvae_checkpoint = torch.load(args.vqvae_checkpoint_path, map_location=checkpoint_map_location)["model"]
+
+    with init_empty_weights():
+        vqvae_model = vqvae_model_from_original_config(vqvae_original_config)
+
+    vqvae_diffusers_checkpoint = vqvae_original_checkpoint_to_diffusers_checkpoint(vqvae_model, vqvae_checkpoint)
+
+    with tempfile.NamedTemporaryFile() as vqvae_diffusers_checkpoint_file:
+        torch.save(vqvae_diffusers_checkpoint, vqvae_diffusers_checkpoint_file.name)
+        del vqvae_diffusers_checkpoint
+        del vqvae_checkpoint
+        load_checkpoint_and_dispatch(vqvae_model, vqvae_diffusers_checkpoint_file.name, device_map="auto")
+
+    print("done loading vqvae")
+
+    # done vqvae_model
+
+    # transformer_model
+
+    print(
+        f"loading transformer, config: {args.original_config_file}, checkpoint: {args.checkpoint_path}, use ema:"
+        f" {use_ema}"
+    )
+
+    original_config = read_config_file(args.original_config_file).model
+
+    diffusion_config = original_config.params.diffusion_config
+    transformer_config = original_config.params.diffusion_config.params.transformer_config
+    content_embedding_config = original_config.params.diffusion_config.params.content_emb_config
+
+    pre_checkpoint = torch.load(args.checkpoint_path, map_location=checkpoint_map_location)
+
+    if use_ema:
+        if "ema" in pre_checkpoint:
+            checkpoint = {}
+            for k, v in pre_checkpoint["model"].items():
+                checkpoint[k] = v
+
+            for k, v in pre_checkpoint["ema"].items():
+                # The ema weights are only used on the transformer. To mimic their key as if they came
+                # from the state_dict for the top level model, we prefix with an additional "transformer."
+                # See the source linked in the args.use_ema config for more information.
+                checkpoint[f"transformer.{k}"] = v
+        else:
+            print("attempted to load ema weights but no ema weights are specified in the loaded checkpoint.")
+            checkpoint = pre_checkpoint["model"]
+    else:
+        checkpoint = pre_checkpoint["model"]
+
+    del pre_checkpoint
+
+    with init_empty_weights():
+        transformer_model = transformer_model_from_original_config(
+            diffusion_config, transformer_config, content_embedding_config
+        )
+
+    diffusers_transformer_checkpoint = transformer_original_checkpoint_to_diffusers_checkpoint(
+        transformer_model, checkpoint
+    )
+
+    with tempfile.NamedTemporaryFile() as diffusers_transformer_checkpoint_file:
+        torch.save(diffusers_transformer_checkpoint, diffusers_transformer_checkpoint_file.name)
+        del diffusers_transformer_checkpoint
+        del checkpoint
+        load_checkpoint_and_dispatch(transformer_model, diffusers_transformer_checkpoint_file.name, device_map="auto")
+
+    print("done loading transformer")
+
+    # done transformer_model
+
+    # text encoder
+
+    print("loading CLIP text encoder")
+
+    clip_name = "openai/clip-vit-base-patch32"
+
+    # The original VQ-Diffusion specifies the pad value by the int used in the
+    # returned tokens. Each model uses `0` as the pad value. The transformers clip api
+    # specifies the pad value via the token before it has been tokenized. The `!` pad
+    # token is the same as padding with the `0` pad value.
+    pad_token = "!"
+
+    tokenizer_model = CLIPTokenizer.from_pretrained(clip_name, pad_token=pad_token, device_map="auto")
+
+    assert tokenizer_model.convert_tokens_to_ids(pad_token) == 0
+
+    text_encoder_model = CLIPTextModel.from_pretrained(
+        clip_name,
+        # `CLIPTextModel` does not support device_map="auto"
+        # device_map="auto"
+    )
+
+    print("done loading CLIP text encoder")
+
+    # done text encoder
+
+    # scheduler
+
+    scheduler_model = VQDiffusionScheduler(
+        # the scheduler has the same number of embeddings as the transformer
+        num_vec_classes=transformer_model.num_vector_embeds
+    )
+
+    # done scheduler
+
+    print(f"saving VQ diffusion model, path: {args.dump_path}")
+
+    pipe = VQDiffusionPipeline(
+        vqvae=vqvae_model,
+        transformer=transformer_model,
+        tokenizer=tokenizer_model,
+        text_encoder=text_encoder_model,
+        scheduler=scheduler_model,
+    )
+    pipe.save_pretrained(args.dump_path)
+
+    print("done writing VQ diffusion model")
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 1a9a7d74ebee..00052109e32f 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -18,7 +18,7 @@
 
 if is_torch_available():
     from .modeling_utils import ModelMixin
-    from .models import AutoencoderKL, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel
+    from .models import AutoencoderKL, Transformer2DModel, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel
     from .optimization import (
         get_constant_schedule,
         get_constant_schedule_with_warmup,
@@ -38,6 +38,7 @@
         PNDMPipeline,
         RePaintPipeline,
         ScoreSdeVePipeline,
+        VQDiffusionPipeline,
     )
     from .schedulers import (
         DDIMScheduler,
@@ -50,6 +51,7 @@
         RePaintScheduler,
         SchedulerMixin,
         ScoreSdeVeScheduler,
+        VQDiffusionScheduler,
     )
     from .training_utils import EMAModel
 else:
diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
index c5d53b2feb4b..5b101d169148 100644
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -16,6 +16,7 @@
 
 
 if is_torch_available():
+    from .attention import Transformer2DModel
     from .unet_1d import UNet1DModel
     from .unet_2d import UNet2DModel
     from .unet_2d_condition import UNet2DConditionModel
diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 372c8492b485..bac85e2f39cf 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -12,13 +12,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
+from dataclasses import dataclass
 from typing import Optional
 
 import torch
 import torch.nn.functional as F
 from torch import nn
 
-from diffusers.utils.import_utils import is_xformers_available
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..modeling_utils import ModelMixin
+from ..models.embeddings import ImagePositionalEmbeddings
+from ..utils import BaseOutput
+from ..utils.import_utils import is_xformers_available
+
+
+@dataclass
+class Transformer2DModelOutput(BaseOutput):
+    """
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
+            Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions
+            for the unnoised latent pixels.
+    """
+
+    sample: torch.FloatTensor
 
 
 if is_xformers_available():
@@ -28,6 +45,186 @@
     xformers = None
 
 
+class Transformer2DModel(ModelMixin, ConfigMixin):
+    """
+    Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
+    embeddings) inputs.
+
+    When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
+    transformer action. Finally, reshape to image.
+
+    When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
+    embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
+    classes of unnoised image.
+
+    Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
+    image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            Pass if the input is continuous. The number of channels in the input and output.
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of context dimensions to use.
+        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
+            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
+            `ImagePositionalEmbeddings`.
+        num_vector_embeds (`int`, *optional*):
+            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+            Includes the class for the masked latent pixel.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
+            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
+            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
+            up to but not more than steps than `num_embeds_ada_norm`.
+        attention_bias (`bool`, *optional*):
+            Configure if the TransformerBlocks' attention should contain a bias parameter.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+    ):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+
+        # 1. Transformer2DModel can process both standard continous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+        # Define whether input is continuous or discrete depending on configuration
+        self.is_input_continuous = in_channels is not None
+        self.is_input_vectorized = num_vector_embeds is not None
+
+        if self.is_input_continuous and self.is_input_vectorized:
+            raise ValueError(
+                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
+                " sure that either `in_channels` or `num_vector_embeds` is None."
+            )
+        elif not self.is_input_continuous and not self.is_input_vectorized:
+            raise ValueError(
+                f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make"
+                " sure that either `in_channels` or `num_vector_embeds` is not None."
+            )
+
+        # 2. Define input layers
+        if self.is_input_continuous:
+            self.in_channels = in_channels
+
+            self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
+            assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
+
+            self.height = sample_size
+            self.width = sample_size
+            self.num_vector_embeds = num_vector_embeds
+            self.num_latent_pixels = self.height * self.width
+
+            self.latent_image_embedding = ImagePositionalEmbeddings(
+                num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
+            )
+
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        # 4. Define output layers
+        if self.is_input_continuous:
+            self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            self.norm_out = nn.LayerNorm(inner_dim)
+            self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
+
+    def _set_attention_slice(self, slice_size):
+        for block in self.transformer_blocks:
+            block._set_attention_slice(slice_size)
+
+    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
+        """
+        Args:
+            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
+                When continous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
+                hidden_states
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, context dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.long`, *optional*):
+                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
+            if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
+            tensor.
+        """
+        # 1. Input
+        if self.is_input_continuous:
+            batch, channel, height, weight = hidden_states.shape
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+            hidden_states = self.proj_in(hidden_states)
+            inner_dim = hidden_states.shape[1]
+            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
+        elif self.is_input_vectorized:
+            hidden_states = self.latent_image_embedding(hidden_states)
+
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            hidden_states = block(hidden_states, context=encoder_hidden_states, timestep=timestep)
+
+        # 3. Output
+        if self.is_input_continuous:
+            hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2)
+            hidden_states = self.proj_out(hidden_states)
+            output = hidden_states + residual
+        elif self.is_input_vectorized:
+            hidden_states = self.norm_out(hidden_states)
+            logits = self.out(hidden_states)
+            # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
+            logits = logits.permute(0, 2, 1)
+
+            # log(p(x_0))
+            output = F.log_softmax(logits.double(), dim=1).float()
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
+
+    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for block in self.transformer_blocks:
+            block._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+
 class AttentionBlock(nn.Module):
     """
     An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
@@ -36,19 +233,19 @@ class AttentionBlock(nn.Module):
     Uses three q, k, v linear layers to compute attention.
 
     Parameters:
-        channels (:obj:`int`): The number of channels in the input and output.
-        num_head_channels (:obj:`int`, *optional*):
+        channels (`int`): The number of channels in the input and output.
+        num_head_channels (`int`, *optional*):
             The number of channels in each head. If None, then `num_heads` = 1.
-        num_groups (:obj:`int`, *optional*, defaults to 32): The number of groups to use for group norm.
-        rescale_output_factor (:obj:`float`, *optional*, defaults to 1.0): The factor to rescale the output by.
-        eps (:obj:`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for group norm.
+        rescale_output_factor (`float`, *optional*, defaults to 1.0): The factor to rescale the output by.
+        eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
     """
 
     def __init__(
         self,
         channels: int,
         num_head_channels: Optional[int] = None,
-        num_groups: int = 32,
+        norm_num_groups: int = 32,
         rescale_output_factor: float = 1.0,
         eps: float = 1e-5,
     ):
@@ -57,7 +254,7 @@ def __init__(
 
         self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
         self.num_head_size = num_head_channels
-        self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=num_groups, eps=eps, affine=True)
+        self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)
 
         # define q,k,v as linear layers
         self.query = nn.Linear(channels, channels)
@@ -113,107 +310,61 @@ def forward(self, hidden_states):
         return hidden_states
 
 
-class SpatialTransformer(nn.Module):
-    """
-    Transformer block for image-like data. First, project the input (aka embedding) and reshape to b, t, d. Then apply
-    standard transformer action. Finally, reshape to image.
-
-    Parameters:
-        in_channels (:obj:`int`): The number of channels in the input and output.
-        n_heads (:obj:`int`): The number of heads to use for multi-head attention.
-        d_head (:obj:`int`): The number of channels in each head.
-        depth (:obj:`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
-        dropout (:obj:`float`, *optional*, defaults to 0.1): The dropout probability to use.
-        context_dim (:obj:`int`, *optional*): The number of context dimensions to use.
-    """
-
-    def __init__(
-        self,
-        in_channels: int,
-        n_heads: int,
-        d_head: int,
-        depth: int = 1,
-        dropout: float = 0.0,
-        num_groups: int = 32,
-        context_dim: Optional[int] = None,
-    ):
-        super().__init__()
-        self.n_heads = n_heads
-        self.d_head = d_head
-        self.in_channels = in_channels
-        inner_dim = n_heads * d_head
-        self.norm = torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
-
-        self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
-
-        self.transformer_blocks = nn.ModuleList(
-            [
-                BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim)
-                for d in range(depth)
-            ]
-        )
-
-        self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
-
-    def _set_attention_slice(self, slice_size):
-        for block in self.transformer_blocks:
-            block._set_attention_slice(slice_size)
-
-    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
-        for block in self.transformer_blocks:
-            block._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
-
-    def forward(self, hidden_states, context=None):
-        # note: if no context is given, cross-attention defaults to self-attention
-        batch, channel, height, width = hidden_states.shape
-        residual = hidden_states
-        hidden_states = self.norm(hidden_states)
-        hidden_states = self.proj_in(hidden_states)
-        inner_dim = hidden_states.shape[1]
-        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
-        for block in self.transformer_blocks:
-            hidden_states = block(hidden_states, context=context)
-        hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2)
-        hidden_states = self.proj_out(hidden_states)
-        return hidden_states + residual
-
-
 class BasicTransformerBlock(nn.Module):
     r"""
     A basic Transformer block.
 
     Parameters:
-        dim (:obj:`int`): The number of channels in the input and output.
-        n_heads (:obj:`int`): The number of heads to use for multi-head attention.
-        d_head (:obj:`int`): The number of channels in each head.
-        dropout (:obj:`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        context_dim (:obj:`int`, *optional*): The size of the context vector for cross attention.
-        gated_ff (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether to use a gated feed-forward network.
-        checkpoint (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether to use checkpointing.
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the context vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (:
+            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (:
+            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
     """
 
     def __init__(
         self,
         dim: int,
-        n_heads: int,
-        d_head: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
         dropout=0.0,
-        context_dim: Optional[int] = None,
-        gated_ff: bool = True,
-        checkpoint: bool = True,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
     ):
         super().__init__()
         self.attn1 = CrossAttention(
-            query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
         )  # is a self-attention
-        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
         self.attn2 = CrossAttention(
-            query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout
+            query_dim=dim,
+            cross_attention_dim=cross_attention_dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
         )  # is self-attn if context is none
-        self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
+
+        # layer norms
+        self.use_ada_layer_norm = num_embeds_ada_norm is not None
+        if self.use_ada_layer_norm:
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+            self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        else:
+            self.norm1 = nn.LayerNorm(dim)
+            self.norm2 = nn.LayerNorm(dim)
         self.norm3 = nn.LayerNorm(dim)
-        self.checkpoint = checkpoint
 
     def _set_attention_slice(self, slice_size):
         self.attn1._slice_size = slice_size
@@ -245,10 +396,22 @@ def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_atte
             self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
             self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
 
-    def forward(self, hidden_states, context=None):
-        hidden_states = self.attn1(self.norm1(hidden_states)) + hidden_states
-        hidden_states = self.attn2(self.norm2(hidden_states), context=context) + hidden_states
+    def forward(self, hidden_states, context=None, timestep=None):
+        # 1. Self-Attention
+        norm_hidden_states = (
+            self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
+        )
+        hidden_states = self.attn1(norm_hidden_states) + hidden_states
+
+        # 2. Cross-Attention
+        norm_hidden_states = (
+            self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+        )
+        hidden_states = self.attn2(norm_hidden_states, context=context) + hidden_states
+
+        # 3. Feed-forward
         hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
+
         return hidden_states
 
 
@@ -257,20 +420,28 @@ class CrossAttention(nn.Module):
     A cross attention layer.
 
     Parameters:
-        query_dim (:obj:`int`): The number of channels in the query.
-        context_dim (:obj:`int`, *optional*):
+        query_dim (`int`): The number of channels in the query.
+        cross_attention_dim (`int`, *optional*):
             The number of channels in the context. If not given, defaults to `query_dim`.
-        heads (:obj:`int`,  *optional*, defaults to 8): The number of heads to use for multi-head attention.
-        dim_head (:obj:`int`,  *optional*, defaults to 64): The number of channels in each head.
-        dropout (:obj:`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        heads (`int`,  *optional*, defaults to 8): The number of heads to use for multi-head attention.
+        dim_head (`int`,  *optional*, defaults to 64): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        bias (`bool`, *optional*, defaults to False):
+            Set to `True` for the query, key, and value linear layers to contain a bias parameter.
     """
 
     def __init__(
-        self, query_dim: int, context_dim: Optional[int] = None, heads: int = 8, dim_head: int = 64, dropout: int = 0.0
+        self,
+        query_dim: int,
+        cross_attention_dim: Optional[int] = None,
+        heads: int = 8,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        bias=False,
     ):
         super().__init__()
         inner_dim = dim_head * heads
-        context_dim = context_dim if context_dim is not None else query_dim
+        cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
 
         self.scale = dim_head**-0.5
         self.heads = heads
@@ -280,9 +451,9 @@ def __init__(
         self._slice_size = None
         self._use_memory_efficient_attention_xformers = False
 
-        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
-        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
-        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
+        self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
+        self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
 
         self.to_out = nn.ModuleList([])
         self.to_out.append(nn.Linear(inner_dim, query_dim))
@@ -394,23 +565,33 @@ class FeedForward(nn.Module):
     A feed-forward layer.
 
     Parameters:
-        dim (:obj:`int`): The number of channels in the input.
-        dim_out (:obj:`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
-        mult (:obj:`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
-        glu (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether to use GLU activation.
-        dropout (:obj:`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        dim (`int`): The number of channels in the input.
+        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
+        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
     """
 
     def __init__(
-        self, dim: int, dim_out: Optional[int] = None, mult: int = 4, glu: bool = False, dropout: float = 0.0
+        self,
+        dim: int,
+        dim_out: Optional[int] = None,
+        mult: int = 4,
+        dropout: float = 0.0,
+        activation_fn: str = "geglu",
     ):
         super().__init__()
         inner_dim = int(dim * mult)
         dim_out = dim_out if dim_out is not None else dim
-        self.net = nn.ModuleList([])
 
+        if activation_fn == "geglu":
+            geglu = GEGLU(dim, inner_dim)
+        elif activation_fn == "geglu-approximate":
+            geglu = ApproximateGELU(dim, inner_dim)
+
+        self.net = nn.ModuleList([])
         # project in
-        self.net.append(GEGLU(dim, inner_dim))
+        self.net.append(geglu)
         # project dropout
         self.net.append(nn.Dropout(dropout))
         # project out
@@ -428,8 +609,8 @@ class GEGLU(nn.Module):
     A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
 
     Parameters:
-        dim_in (:obj:`int`): The number of channels in the input.
-        dim_out (:obj:`int`): The number of channels in the output.
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
     """
 
     def __init__(self, dim_in: int, dim_out: int):
@@ -445,3 +626,38 @@ def gelu(self, gate):
     def forward(self, hidden_states):
         hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
         return hidden_states * self.gelu(gate)
+
+
+class ApproximateGELU(nn.Module):
+    """
+    The approximate form of Gaussian Error Linear Unit (GELU)
+
+    For more details, see section 2: https://arxiv.org/abs/1606.08415
+    """
+
+    def __init__(self, dim_in: int, dim_out: int):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out)
+
+    def forward(self, x):
+        x = self.proj(x)
+        return x * torch.sigmoid(1.702 * x)
+
+
+class AdaLayerNorm(nn.Module):
+    """
+    Norm layer modified to incorporate timestep embeddings.
+    """
+
+    def __init__(self, embedding_dim, num_embeddings):
+        super().__init__()
+        self.emb = nn.Embedding(num_embeddings, embedding_dim)
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
+        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
+
+    def forward(self, x, timestep):
+        emb = self.linear(self.silu(self.emb(timestep)))
+        scale, shift = torch.chunk(emb, 2)
+        x = self.norm(x) * (1 + scale) + shift
+        return x
diff --git a/src/diffusers/models/attention_flax.py b/src/diffusers/models/attention_flax.py
index 1745265b91e1..1b8609474750 100644
--- a/src/diffusers/models/attention_flax.py
+++ b/src/diffusers/models/attention_flax.py
@@ -142,7 +142,7 @@ def __call__(self, hidden_states, context, deterministic=True):
         return hidden_states
 
 
-class FlaxSpatialTransformer(nn.Module):
+class FlaxTransformer2DModel(nn.Module):
     r"""
     A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
     https://arxiv.org/pdf/1506.02025.pdf
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
index 35715e17fc47..b09d43fc2edf 100644
--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -126,3 +126,68 @@ def forward(self, x):
         else:
             out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
         return out
+
+
+class ImagePositionalEmbeddings(nn.Module):
+    """
+    Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
+    height and width of the latent space.
+
+    For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092
+
+    For VQ-diffusion:
+
+    Output vector embeddings are used as input for the transformer.
+
+    Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE.
+
+    Args:
+        num_embed (`int`):
+            Number of embeddings for the latent pixels embeddings.
+        height (`int`):
+            Height of the latent image i.e. the number of height embeddings.
+        width (`int`):
+            Width of the latent image i.e. the number of width embeddings.
+        embed_dim (`int`):
+            Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings.
+    """
+
+    def __init__(
+        self,
+        num_embed: int,
+        height: int,
+        width: int,
+        embed_dim: int,
+    ):
+        super().__init__()
+
+        self.height = height
+        self.width = width
+        self.num_embed = num_embed
+        self.embed_dim = embed_dim
+
+        self.emb = nn.Embedding(self.num_embed, embed_dim)
+        self.height_emb = nn.Embedding(self.height, embed_dim)
+        self.width_emb = nn.Embedding(self.width, embed_dim)
+
+    def forward(self, index):
+        emb = self.emb(index)
+
+        height_emb = self.height_emb(torch.arange(self.height, device=index.device).view(1, self.height))
+
+        # 1 x H x D -> 1 x H x 1 x D
+        height_emb = height_emb.unsqueeze(2)
+
+        width_emb = self.width_emb(torch.arange(self.width, device=index.device).view(1, self.width))
+
+        # 1 x W x D -> 1 x 1 x W x D
+        width_emb = width_emb.unsqueeze(1)
+
+        pos_emb = height_emb + width_emb
+
+        # 1 x H x W x D -> 1 x L xD
+        pos_emb = pos_emb.view(1, self.height * self.width, -1)
+
+        emb = emb + pos_emb[:, : emb.shape[1], :]
+
+        return emb
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index ae4fe2d8bba7..4132ccbd0ce0 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -15,7 +15,7 @@
 import torch
 from torch import nn
 
-from .attention import AttentionBlock, SpatialTransformer
+from .attention import AttentionBlock, Transformer2DModel
 from .resnet import Downsample2D, FirDownsample2D, FirUpsample2D, ResnetBlock2D, Upsample2D
 
 
@@ -109,6 +109,19 @@ def get_down_block(
             resnet_groups=resnet_groups,
             downsample_padding=downsample_padding,
         )
+    elif down_block_type == "AttnDownEncoderBlock2D":
+        return AttnDownEncoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
 
 
 def get_up_block(
@@ -200,6 +213,17 @@ def get_up_block(
             resnet_act_fn=resnet_act_fn,
             resnet_groups=resnet_groups,
         )
+    elif up_block_type == "AttnUpDecoderBlock2D":
+        return AttnUpDecoderBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            attn_num_head_channels=attn_num_head_channels,
+        )
     raise ValueError(f"{up_block_type} does not exist.")
 
 
@@ -249,7 +273,7 @@ def __init__(
                     num_head_channels=attn_num_head_channels,
                     rescale_output_factor=output_scale_factor,
                     eps=resnet_eps,
-                    num_groups=resnet_groups,
+                    norm_num_groups=resnet_groups,
                 )
             )
             resnets.append(
@@ -325,13 +349,13 @@ def __init__(
 
         for _ in range(num_layers):
             attentions.append(
-                SpatialTransformer(
-                    in_channels,
+                Transformer2DModel(
                     attn_num_head_channels,
                     in_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                    num_groups=resnet_groups,
+                    in_channels=in_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
                 )
             )
             resnets.append(
@@ -374,7 +398,7 @@ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_atten
     def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
         hidden_states = self.resnets[0](hidden_states, temb)
         for attn, resnet in zip(self.attentions, self.resnets[1:]):
-            hidden_states = attn(hidden_states, encoder_hidden_states)
+            hidden_states = attn(hidden_states, encoder_hidden_states).sample
             hidden_states = resnet(hidden_states, temb)
 
         return hidden_states
@@ -427,7 +451,7 @@ def __init__(
                     num_head_channels=attn_num_head_channels,
                     rescale_output_factor=output_scale_factor,
                     eps=resnet_eps,
-                    num_groups=resnet_groups,
+                    norm_num_groups=resnet_groups,
                 )
             )
 
@@ -506,13 +530,13 @@ def __init__(
                 )
             )
             attentions.append(
-                SpatialTransformer(
-                    out_channels,
+                Transformer2DModel(
                     attn_num_head_channels,
                     out_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                    num_groups=resnet_groups,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
                 )
             )
         self.attentions = nn.ModuleList(attentions)
@@ -556,19 +580,22 @@ def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
         for resnet, attn in zip(self.resnets, self.attentions):
             if self.training and self.gradient_checkpointing:
 
-                def create_custom_forward(module):
+                def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
-                        return module(*inputs)
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
 
                     return custom_forward
 
                 hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
                 hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(attn), hidden_states, encoder_hidden_states
-                )
+                    create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
+                )[0]
             else:
                 hidden_states = resnet(hidden_states, temb)
-                hidden_states = attn(hidden_states, context=encoder_hidden_states)
+                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
 
             output_states += (hidden_states,)
 
@@ -763,7 +790,7 @@ def __init__(
                     num_head_channels=attn_num_head_channels,
                     rescale_output_factor=output_scale_factor,
                     eps=resnet_eps,
-                    num_groups=resnet_groups,
+                    norm_num_groups=resnet_groups,
                 )
             )
 
@@ -1014,7 +1041,7 @@ def __init__(
                     num_head_channels=attn_num_head_channels,
                     rescale_output_factor=output_scale_factor,
                     eps=resnet_eps,
-                    num_groups=resnet_groups,
+                    norm_num_groups=resnet_groups,
                 )
             )
 
@@ -1089,13 +1116,13 @@ def __init__(
                 )
             )
             attentions.append(
-                SpatialTransformer(
-                    out_channels,
+                Transformer2DModel(
                     attn_num_head_channels,
                     out_channels // attn_num_head_channels,
-                    depth=1,
-                    context_dim=cross_attention_dim,
-                    num_groups=resnet_groups,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
                 )
             )
         self.attentions = nn.ModuleList(attentions)
@@ -1145,19 +1172,22 @@ def forward(
 
             if self.training and self.gradient_checkpointing:
 
-                def create_custom_forward(module):
+                def create_custom_forward(module, return_dict=None):
                     def custom_forward(*inputs):
-                        return module(*inputs)
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
 
                     return custom_forward
 
                 hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
                 hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(attn), hidden_states, encoder_hidden_states
-                )
+                    create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
+                )[0]
             else:
                 hidden_states = resnet(hidden_states, temb)
-                hidden_states = attn(hidden_states, context=encoder_hidden_states)
+                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
 
         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
@@ -1337,7 +1367,7 @@ def __init__(
                     num_head_channels=attn_num_head_channels,
                     rescale_output_factor=output_scale_factor,
                     eps=resnet_eps,
-                    num_groups=resnet_groups,
+                    norm_num_groups=resnet_groups,
                 )
             )
 
diff --git a/src/diffusers/models/unet_2d_blocks_flax.py b/src/diffusers/models/unet_2d_blocks_flax.py
index baa71beabe35..5798385b9d28 100644
--- a/src/diffusers/models/unet_2d_blocks_flax.py
+++ b/src/diffusers/models/unet_2d_blocks_flax.py
@@ -15,7 +15,7 @@
 import flax.linen as nn
 import jax.numpy as jnp
 
-from .attention_flax import FlaxSpatialTransformer
+from .attention_flax import FlaxTransformer2DModel
 from .resnet_flax import FlaxDownsample2D, FlaxResnetBlock2D, FlaxUpsample2D
 
 
@@ -63,7 +63,7 @@ def setup(self):
             )
             resnets.append(res_block)
 
-            attn_block = FlaxSpatialTransformer(
+            attn_block = FlaxTransformer2DModel(
                 in_channels=self.out_channels,
                 n_heads=self.attn_num_head_channels,
                 d_head=self.out_channels // self.attn_num_head_channels,
@@ -196,7 +196,7 @@ def setup(self):
             )
             resnets.append(res_block)
 
-            attn_block = FlaxSpatialTransformer(
+            attn_block = FlaxTransformer2DModel(
                 in_channels=self.out_channels,
                 n_heads=self.attn_num_head_channels,
                 d_head=self.out_channels // self.attn_num_head_channels,
@@ -326,7 +326,7 @@ def setup(self):
         attentions = []
 
         for _ in range(self.num_layers):
-            attn_block = FlaxSpatialTransformer(
+            attn_block = FlaxTransformer2DModel(
                 in_channels=self.in_channels,
                 n_heads=self.attn_num_head_channels,
                 d_head=self.in_channels // self.attn_num_head_channels,
diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py
index 5f5a47dada0f..30de343d08ee 100644
--- a/src/diffusers/models/vae.py
+++ b/src/diffusers/models/vae.py
@@ -233,14 +233,16 @@ class VectorQuantizer(nn.Module):
     # NOTE: due to a bug the beta term was applied to the wrong term. for
     # backwards compatibility we use the buggy version by default, but you can
     # specify legacy=False to fix it.
-    def __init__(self, n_e, e_dim, beta, remap=None, unknown_index="random", sane_index_shape=False, legacy=True):
+    def __init__(
+        self, n_e, vq_embed_dim, beta, remap=None, unknown_index="random", sane_index_shape=False, legacy=True
+    ):
         super().__init__()
         self.n_e = n_e
-        self.e_dim = e_dim
+        self.vq_embed_dim = vq_embed_dim
         self.beta = beta
         self.legacy = legacy
 
-        self.embedding = nn.Embedding(self.n_e, self.e_dim)
+        self.embedding = nn.Embedding(self.n_e, self.vq_embed_dim)
         self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
 
         self.remap = remap
@@ -287,7 +289,7 @@ def unmap_to_all(self, inds):
     def forward(self, z):
         # reshape z -> (batch, height, width, channel) and flatten
         z = z.permute(0, 2, 3, 1).contiguous()
-        z_flattened = z.view(-1, self.e_dim)
+        z_flattened = z.view(-1, self.vq_embed_dim)
         # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
 
         d = (
@@ -409,6 +411,7 @@ class VQModel(ModelMixin, ConfigMixin):
         latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
         sample_size (`int`, *optional*, defaults to `32`): TODO
         num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
+        vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
     """
 
     @register_to_config
@@ -425,6 +428,7 @@ def __init__(
         sample_size: int = 32,
         num_vq_embeddings: int = 256,
         norm_num_groups: int = 32,
+        vq_embed_dim: Optional[int] = None,
     ):
         super().__init__()
 
@@ -440,11 +444,11 @@ def __init__(
             double_z=False,
         )
 
-        self.quant_conv = torch.nn.Conv2d(latent_channels, latent_channels, 1)
-        self.quantize = VectorQuantizer(
-            num_vq_embeddings, latent_channels, beta=0.25, remap=None, sane_index_shape=False
-        )
-        self.post_quant_conv = torch.nn.Conv2d(latent_channels, latent_channels, 1)
+        vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels
+
+        self.quant_conv = torch.nn.Conv2d(latent_channels, vq_embed_dim, 1)
+        self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
+        self.post_quant_conv = torch.nn.Conv2d(vq_embed_dim, latent_channels, 1)
 
         # pass init params to Decoder
         self.decoder = Decoder(
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 8015d4e11466..bb3440b2bfbc 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -21,6 +21,7 @@
         StableDiffusionInpaintPipelineLegacy,
         StableDiffusionPipeline,
     )
+    from .vq_diffusion import VQDiffusionPipeline
 
 if is_transformers_available() and is_onnx_available():
     from .stable_diffusion import (
diff --git a/src/diffusers/pipelines/vq_diffusion/__init__.py b/src/diffusers/pipelines/vq_diffusion/__init__.py
new file mode 100644
index 000000000000..edf6f570f5bf
--- /dev/null
+++ b/src/diffusers/pipelines/vq_diffusion/__init__.py
@@ -0,0 +1 @@
+from .pipeline_vq_diffusion import VQDiffusionPipeline
diff --git a/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py b/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
new file mode 100644
index 000000000000..6e5325ba7ef5
--- /dev/null
+++ b/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
@@ -0,0 +1,253 @@
+# Copyright 2022 Microsoft and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+
+from diffusers import Transformer2DModel, VQModel
+from diffusers.schedulers.scheduling_vq_diffusion import VQDiffusionScheduler
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class VQDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using VQ Diffusion
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vqvae ([`VQModel`]):
+            Vector Quantized Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent
+            representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. VQ Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        transformer ([`Transformer2DModel`]):
+            Conditional transformer to denoise the encoded image latents.
+        scheduler ([`VQDiffusionScheduler`]):
+            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
+    """
+
+    vqvae: VQModel
+    text_encoder: CLIPTextModel
+    tokenizer: CLIPTokenizer
+    transformer: Transformer2DModel
+    scheduler: VQDiffusionScheduler
+
+    def __init__(
+        self,
+        vqvae: VQModel,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        transformer: Transformer2DModel,
+        scheduler: VQDiffusionScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vqvae=vqvae,
+            transformer=transformer,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+        )
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        num_inference_steps: int = 100,
+        truncation_rate: float = 1.0,
+        num_images_per_prompt: int = 1,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            truncation_rate (`float`, *optional*, defaults to 1.0 (equivalent to no truncation)):
+                Used to "truncate" the predicted classes for x_0 such that the cumulative probability for a pixel is at
+                most `truncation_rate`. The lowest probabilities that would increase the cumulative probability above
+                `truncation_rate` are set to zero.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor` of shape (batch), *optional*):
+                Pre-generated noisy latents to be used as inputs for image generation. Must be valid embedding indices.
+                Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will
+                be generated of completely masked latent pixels.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~ pipeline_utils.ImagePipelineOutput `] if
+            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        batch_size = batch_size * num_images_per_prompt
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion.
+        # While CLIP does normalize the pooled output of the text transformer when combining
+        # the image and text embeddings, CLIP does not directly normalize the last hidden state.
+        #
+        # CLIP normalizing the pooled output.
+        # https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053
+        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # get the initial completely masked latents unless the user supplied it
+
+        latents_shape = (batch_size, self.transformer.num_latent_pixels)
+        if latents is None:
+            mask_class = self.transformer.num_vector_embeds - 1
+            latents = torch.full(latents_shape, mask_class).to(self.device)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            if (latents < 0).any() or (latents >= self.transformer.num_vector_embeds).any():
+                raise ValueError(
+                    "Unexpected latents value(s). All latents be valid embedding indices i.e. in the range 0,"
+                    f" {self.transformer.num_vector_embeds - 1} (inclusive)."
+                )
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        sample = latents
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # predict the un-noised image
+            # model_output == `log_p_x_0`
+            model_output = self.transformer(sample, encoder_hidden_states=text_embeddings, timestep=t).sample
+
+            model_output = self.truncate(model_output, truncation_rate)
+
+            # remove `log(0)`'s (`-inf`s)
+            model_output = model_output.clamp(-70)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            sample = self.scheduler.step(model_output, timestep=t, sample=sample, generator=generator).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, sample)
+
+        embedding_channels = self.vqvae.config.vq_embed_dim
+        embeddings_shape = (batch_size, self.transformer.height, self.transformer.width, embedding_channels)
+        embeddings = self.vqvae.quantize.get_codebook_entry(sample, shape=embeddings_shape)
+        image = self.vqvae.decode(embeddings, force_not_quantize=True).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
+
+    def truncate(self, log_p_x_0: torch.FloatTensor, truncation_rate: float) -> torch.FloatTensor:
+        """
+        Truncates log_p_x_0 such that for each column vector, the total cumulative probability is `truncation_rate` The
+        lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to zero.
+        """
+        sorted_log_p_x_0, indices = torch.sort(log_p_x_0, 1, descending=True)
+        sorted_p_x_0 = torch.exp(sorted_log_p_x_0)
+        keep_mask = sorted_p_x_0.cumsum(dim=1) < truncation_rate
+
+        # Ensure that at least the largest probability is not zeroed out
+        all_true = torch.full_like(keep_mask[:, 0:1, :], True)
+        keep_mask = torch.cat((all_true, keep_mask), dim=1)
+        keep_mask = keep_mask[:, :-1, :]
+
+        keep_mask = keep_mask.gather(1, indices.argsort(1))
+
+        rv = log_p_x_0.clone()
+
+        rv[~keep_mask] = -torch.inf  # -inf = log(0)
+
+        return rv
diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index a1915ed8d2b1..1be541ba8b66 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -28,6 +28,7 @@
     from .scheduling_sde_ve import ScoreSdeVeScheduler
     from .scheduling_sde_vp import ScoreSdeVpScheduler
     from .scheduling_utils import SchedulerMixin
+    from .scheduling_vq_diffusion import VQDiffusionScheduler
 else:
     from ..utils.dummy_pt_objects import *  # noqa F403
 
diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py
new file mode 100644
index 000000000000..dbe320d998a3
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py
@@ -0,0 +1,494 @@
+# Copyright 2022 Microsoft and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput
+from .scheduling_utils import SchedulerMixin
+
+
+@dataclass
+class VQDiffusionSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's step function output.
+
+    Args:
+        prev_sample (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+            Computed sample x_{t-1} of previous timestep. `prev_sample` should be used as next model input in the
+            denoising loop.
+    """
+
+    prev_sample: torch.LongTensor
+
+
+def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTensor:
+    """
+    Convert batch of vector of class indices into batch of log onehot vectors
+
+    Args:
+        x (`torch.LongTensor` of shape `(batch size, vector length)`):
+            Batch of class indices
+
+        num_classes (`int`):
+            number of classes to be used for the onehot vectors
+
+    Returns:
+        `torch.FloatTensor` of shape `(batch size, num classes, vector length)`:
+            Log onehot vectors
+    """
+    x_onehot = F.one_hot(x, num_classes)
+    x_onehot = x_onehot.permute(0, 2, 1)
+    log_x = torch.log(x_onehot.float().clamp(min=1e-30))
+    return log_x
+
+
+def gumbel_noised(logits: torch.FloatTensor, generator: Optional[torch.Generator]) -> torch.FloatTensor:
+    """
+    Apply gumbel noise to `logits`
+    """
+    uniform = torch.rand(logits.shape, device=logits.device, generator=generator)
+    gumbel_noise = -torch.log(-torch.log(uniform + 1e-30) + 1e-30)
+    noised = gumbel_noise + logits
+    return noised
+
+
+def alpha_schedules(num_diffusion_timesteps: int, alpha_cum_start=0.99999, alpha_cum_end=0.000009):
+    """
+    Cumulative and non-cumulative alpha schedules.
+
+    See section 4.1.
+    """
+    att = (
+        np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (alpha_cum_end - alpha_cum_start)
+        + alpha_cum_start
+    )
+    att = np.concatenate(([1], att))
+    at = att[1:] / att[:-1]
+    att = np.concatenate((att[1:], [1]))
+    return at, att
+
+
+def gamma_schedules(num_diffusion_timesteps: int, gamma_cum_start=0.000009, gamma_cum_end=0.99999):
+    """
+    Cumulative and non-cumulative gamma schedules.
+
+    See section 4.1.
+    """
+    ctt = (
+        np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (gamma_cum_end - gamma_cum_start)
+        + gamma_cum_start
+    )
+    ctt = np.concatenate(([0], ctt))
+    one_minus_ctt = 1 - ctt
+    one_minus_ct = one_minus_ctt[1:] / one_minus_ctt[:-1]
+    ct = 1 - one_minus_ct
+    ctt = np.concatenate((ctt[1:], [0]))
+    return ct, ctt
+
+
+class VQDiffusionScheduler(SchedulerMixin, ConfigMixin):
+    """
+    The VQ-diffusion transformer outputs predicted probabilities of the initial unnoised image.
+
+    The VQ-diffusion scheduler converts the transformer's output into a sample for the unnoised image at the previous
+    diffusion timestep.
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+    [`~ConfigMixin.from_config`] functions.
+
+    For more details, see the original paper: https://arxiv.org/abs/2111.14822
+
+    Args:
+        num_vec_classes (`int`):
+            The number of classes of the vector embeddings of the latent pixels. Includes the class for the masked
+            latent pixel.
+
+        num_train_timesteps (`int`):
+            Number of diffusion steps used to train the model.
+
+        alpha_cum_start (`float`):
+            The starting cumulative alpha value.
+
+        alpha_cum_end (`float`):
+            The ending cumulative alpha value.
+
+        gamma_cum_start (`float`):
+            The starting cumulative gamma value.
+
+        gamma_cum_end (`float`):
+            The ending cumulative gamma value.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_vec_classes: int,
+        num_train_timesteps: int = 100,
+        alpha_cum_start: float = 0.99999,
+        alpha_cum_end: float = 0.000009,
+        gamma_cum_start: float = 0.000009,
+        gamma_cum_end: float = 0.99999,
+    ):
+        self.num_embed = num_vec_classes
+
+        # By convention, the index for the mask class is the last class index
+        self.mask_class = self.num_embed - 1
+
+        at, att = alpha_schedules(num_train_timesteps, alpha_cum_start=alpha_cum_start, alpha_cum_end=alpha_cum_end)
+        ct, ctt = gamma_schedules(num_train_timesteps, gamma_cum_start=gamma_cum_start, gamma_cum_end=gamma_cum_end)
+
+        num_non_mask_classes = self.num_embed - 1
+        bt = (1 - at - ct) / num_non_mask_classes
+        btt = (1 - att - ctt) / num_non_mask_classes
+
+        at = torch.tensor(at.astype("float64"))
+        bt = torch.tensor(bt.astype("float64"))
+        ct = torch.tensor(ct.astype("float64"))
+        log_at = torch.log(at)
+        log_bt = torch.log(bt)
+        log_ct = torch.log(ct)
+
+        att = torch.tensor(att.astype("float64"))
+        btt = torch.tensor(btt.astype("float64"))
+        ctt = torch.tensor(ctt.astype("float64"))
+        log_cumprod_at = torch.log(att)
+        log_cumprod_bt = torch.log(btt)
+        log_cumprod_ct = torch.log(ctt)
+
+        self.log_at = log_at.float()
+        self.log_bt = log_bt.float()
+        self.log_ct = log_ct.float()
+        self.log_cumprod_at = log_cumprod_at.float()
+        self.log_cumprod_bt = log_cumprod_bt.float()
+        self.log_cumprod_ct = log_cumprod_ct.float()
+
+        # setable values
+        self.num_inference_steps = None
+        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+
+            device (`str` or `torch.device`):
+                device to place the timesteps and the diffusion process parameters (alpha, beta, gamma) on.
+        """
+        self.num_inference_steps = num_inference_steps
+        timesteps = np.arange(0, self.num_inference_steps)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+        self.log_at = self.log_at.to(device)
+        self.log_bt = self.log_bt.to(device)
+        self.log_ct = self.log_ct.to(device)
+        self.log_cumprod_at = self.log_cumprod_at.to(device)
+        self.log_cumprod_bt = self.log_cumprod_bt.to(device)
+        self.log_cumprod_ct = self.log_cumprod_ct.to(device)
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: torch.long,
+        sample: torch.LongTensor,
+        generator: Optional[torch.Generator] = None,
+        return_dict: bool = True,
+    ) -> Union[VQDiffusionSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep via the reverse transition distribution i.e. Equation (11). See the
+        docstring for `self.q_posterior` for more in depth docs on how Equation (11) is computed.
+
+        Args:
+            log_p_x_0: (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`):
+                The log probabilities for the predicted classes of the initial latent pixels. Does not include a
+                prediction for the masked class as the initial unnoised image cannot be masked.
+
+            t (`torch.long`):
+                The timestep that determines which transition matrices are used.
+
+            x_t: (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+                The classes of each latent pixel at time `t`
+
+            generator: (`torch.Generator` or None):
+                RNG for the noise applied to p(x_{t-1} | x_t) before it is sampled from.
+
+            return_dict (`bool`):
+                option for returning tuple rather than VQDiffusionSchedulerOutput class
+
+        Returns:
+            [`~schedulers.scheduling_utils.VQDiffusionSchedulerOutput`] or `tuple`:
+            [`~schedulers.scheduling_utils.VQDiffusionSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is the sample tensor.
+        """
+        if timestep == 0:
+            log_p_x_t_min_1 = model_output
+        else:
+            log_p_x_t_min_1 = self.q_posterior(model_output, sample, timestep)
+
+        log_p_x_t_min_1 = gumbel_noised(log_p_x_t_min_1, generator)
+
+        x_t_min_1 = log_p_x_t_min_1.argmax(dim=1)
+
+        if not return_dict:
+            return (x_t_min_1,)
+
+        return VQDiffusionSchedulerOutput(prev_sample=x_t_min_1)
+
+    def q_posterior(self, log_p_x_0, x_t, t):
+        """
+        Calculates the log probabilities for the predicted classes of the image at timestep `t-1`. I.e. Equation (11).
+
+        Instead of directly computing equation (11), we use Equation (5) to restate Equation (11) in terms of only
+        forward probabilities.
+
+        Equation (11) stated in terms of forward probabilities via Equation (5):
+
+        Where:
+        - the sum is over x_0 = {C_0 ... C_{k-1}} (classes for x_0)
+
+        p(x_{t-1} | x_t) = sum( q(x_t | x_{t-1}) * q(x_{t-1} | x_0) * p(x_0) / q(x_t | x_0) )
+
+        Args:
+            log_p_x_0: (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`):
+                The log probabilities for the predicted classes of the initial latent pixels. Does not include a
+                prediction for the masked class as the initial unnoised image cannot be masked.
+
+            x_t: (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+                The classes of each latent pixel at time `t`
+
+            t (torch.Long):
+                The timestep that determines which transition matrix is used.
+
+        Returns:
+            `torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`:
+                The log probabilities for the predicted classes of the image at timestep `t-1`. I.e. Equation (11).
+        """
+        log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed)
+
+        log_q_x_t_given_x_0 = self.log_Q_t_transitioning_to_known_class(
+            t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True
+        )
+
+        log_q_t_given_x_t_min_1 = self.log_Q_t_transitioning_to_known_class(
+            t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False
+        )
+
+        # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0)          ...      p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0)
+        #               .                    .                                   .
+        #               .                            .                           .
+        #               .                                      .                 .
+        # p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1})  ...      p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1})
+        q = log_p_x_0 - log_q_x_t_given_x_0
+
+        # sum_0 = p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + ... + p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}), ... ,
+        # sum_n = p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + ... + p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1})
+        q_log_sum_exp = torch.logsumexp(q, dim=1, keepdim=True)
+
+        # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0          ...      p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n
+        #                        .                             .                                   .
+        #                        .                                     .                           .
+        #                        .                                               .                 .
+        # p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0  ...      p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n
+        q = q - q_log_sum_exp
+
+        # (p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}          ...      (p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}
+        #                                         .                                                .                                              .
+        #                                         .                                                        .                                      .
+        #                                         .                                                                  .                            .
+        # (p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}  ...      (p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}
+        # c_cumulative_{t-1}                                                                                 ...      c_cumulative_{t-1}
+        q = self.apply_cumulative_transitions(q, t - 1)
+
+        # ((p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_0) * sum_0              ...      ((p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_0) * sum_n
+        #                                                            .                                                                 .                                              .
+        #                                                            .                                                                         .                                      .
+        #                                                            .                                                                                   .                            .
+        # ((p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_{k-1}) * sum_0  ...      ((p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_{k-1}) * sum_n
+        # c_cumulative_{t-1} * q(x_t | x_{t-1}=C_k) * sum_0                                                                                       ...      c_cumulative_{t-1} * q(x_t | x_{t-1}=C_k) * sum_0
+        log_p_x_t_min_1 = q + log_q_t_given_x_t_min_1 + q_log_sum_exp
+
+        # For each column, there are two possible cases.
+        #
+        # Where:
+        # - sum(p_n(x_0))) is summing over all classes for x_0
+        # - C_i is the class transitioning from (not to be confused with c_t and c_cumulative_t being used for gamma's)
+        # - C_j is the class transitioning to
+        #
+        # 1. x_t is masked i.e. x_t = c_k
+        #
+        # Simplifying the expression, the column vector is:
+        #                                                      .
+        #                                                      .
+        #                                                      .
+        # (c_t / c_cumulative_t) * (a_cumulative_{t-1} * p_n(x_0 = C_i | x_t) + b_cumulative_{t-1} * sum(p_n(x_0)))
+        #                                                      .
+        #                                                      .
+        #                                                      .
+        # (c_cumulative_{t-1} / c_cumulative_t) * sum(p_n(x_0))
+        #
+        # From equation (11) stated in terms of forward probabilities, the last row is trivially verified.
+        #
+        # For the other rows, we can state the equation as ...
+        #
+        # (c_t / c_cumulative_t) * [b_cumulative_{t-1} * p(x_0=c_0) + ... + (a_cumulative_{t-1} + b_cumulative_{t-1}) * p(x_0=C_i) + ... + b_cumulative_{k-1} * p(x_0=c_{k-1})]
+        #
+        # This verifies the other rows.
+        #
+        # 2. x_t is not masked
+        #
+        # Simplifying the expression, there are two cases for the rows of the column vector, where C_j = C_i and where C_j != C_i:
+        #                                                      .
+        #                                                      .
+        #                                                      .
+        # C_j != C_i:        b_t * ((b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_0) + ... + ((a_cumulative_{t-1} + b_cumulative_{t-1}) / b_cumulative_t) * p_n(x_0 = C_i) + ... + (b_cumulative_{t-1} / (a_cumulative_t + b_cumulative_t)) * p_n(c_0=C_j) + ... + (b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_{k-1}))
+        #                                                      .
+        #                                                      .
+        #                                                      .
+        # C_j = C_i: (a_t + b_t) * ((b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_0) + ... + ((a_cumulative_{t-1} + b_cumulative_{t-1}) / (a_cumulative_t + b_cumulative_t)) * p_n(x_0 = C_i = C_j) + ... + (b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_{k-1}))
+        #                                                      .
+        #                                                      .
+        #                                                      .
+        # 0
+        #
+        # The last row is trivially verified. The other rows can be verified by directly expanding equation (11) stated in terms of forward probabilities.
+        return log_p_x_t_min_1
+
+    def log_Q_t_transitioning_to_known_class(
+        self, *, t: torch.int, x_t: torch.LongTensor, log_onehot_x_t: torch.FloatTensor, cumulative: bool
+    ):
+        """
+        Returns the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each
+        latent pixel in `x_t`.
+
+        See equation (7) for the complete non-cumulative transition matrix. The complete cumulative transition matrix
+        is the same structure except the parameters (alpha, beta, gamma) are the cumulative analogs.
+
+        Args:
+            t (torch.Long):
+                The timestep that determines which transition matrix is used.
+
+            x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`):
+                The classes of each latent pixel at time `t`.
+
+            log_onehot_x_t (`torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`):
+                The log one-hot vectors of `x_t`
+
+            cumulative (`bool`):
+                If cumulative is `False`, we use the single step transition matrix `t-1`->`t`. If cumulative is `True`,
+                we use the cumulative transition matrix `0`->`t`.
+
+        Returns:
+            `torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`:
+                Each _column_ of the returned matrix is a _row_ of log probabilities of the complete probability
+                transition matrix.
+
+                When non cumulative, returns `self.num_classes - 1` rows because the initial latent pixel cannot be
+                masked.
+
+                Where:
+                - `q_n` is the probability distribution for the forward process of the `n`th latent pixel.
+                - C_0 is a class of a latent pixel embedding
+                - C_k is the class of the masked latent pixel
+
+                non-cumulative result (omitting logarithms):
+                ```
+                q_0(x_t | x_{t-1} = C_0) ... q_n(x_t | x_{t-1} = C_0)
+                          .      .                     .
+                          .               .            .
+                          .                      .     .
+                q_0(x_t | x_{t-1} = C_k) ... q_n(x_t | x_{t-1} = C_k)
+                ```
+
+                cumulative result (omitting logarithms):
+                ```
+                q_0_cumulative(x_t | x_0 = C_0)    ...  q_n_cumulative(x_t | x_0 = C_0)
+                          .               .                          .
+                          .                        .                 .
+                          .                               .          .
+                q_0_cumulative(x_t | x_0 = C_{k-1}) ... q_n_cumulative(x_t | x_0 = C_{k-1})
+                ```
+        """
+        if cumulative:
+            a = self.log_cumprod_at[t]
+            b = self.log_cumprod_bt[t]
+            c = self.log_cumprod_ct[t]
+        else:
+            a = self.log_at[t]
+            b = self.log_bt[t]
+            c = self.log_ct[t]
+
+        if not cumulative:
+            # The values in the onehot vector can also be used as the logprobs for transitioning
+            # from masked latent pixels. If we are not calculating the cumulative transitions,
+            # we need to save these vectors to be re-appended to the final matrix so the values
+            # aren't overwritten.
+            #
+            # `P(x_t!=mask|x_{t-1=mask}) = 0` and 0 will be the value of the last row of the onehot vector
+            # if x_t is not masked
+            #
+            # `P(x_t=mask|x_{t-1=mask}) = 1` and 1 will be the value of the last row of the onehot vector
+            # if x_t is masked
+            log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:, -1, :].unsqueeze(1)
+
+        # `index_to_log_onehot` will add onehot vectors for masked pixels,
+        # so the default one hot matrix has one too many rows. See the doc string
+        # for an explanation of the dimensionality of the returned matrix.
+        log_onehot_x_t = log_onehot_x_t[:, :-1, :]
+
+        # this is a cheeky trick to produce the transition probabilities using log one-hot vectors.
+        #
+        # Don't worry about what values this sets in the columns that mark transitions
+        # to masked latent pixels. They are overwrote later with the `mask_class_mask`.
+        #
+        # Looking at the below logspace formula in non-logspace, each value will evaluate to either
+        # `1 * a + b = a + b` where `log_Q_t` has the one hot value in the column
+        # or
+        # `0 * a + b = b` where `log_Q_t` has the 0 values in the column.
+        #
+        # See equation 7 for more details.
+        log_Q_t = (log_onehot_x_t + a).logaddexp(b)
+
+        # The whole column of each masked pixel is `c`
+        mask_class_mask = x_t == self.mask_class
+        mask_class_mask = mask_class_mask.unsqueeze(1).expand(-1, self.num_embed - 1, -1)
+        log_Q_t[mask_class_mask] = c
+
+        if not cumulative:
+            log_Q_t = torch.cat((log_Q_t, log_onehot_x_t_transitioning_from_masked), dim=1)
+
+        return log_Q_t
+
+    def apply_cumulative_transitions(self, q, t):
+        bsz = q.shape[0]
+        a = self.log_cumprod_at[t]
+        b = self.log_cumprod_bt[t]
+        c = self.log_cumprod_ct[t]
+
+        num_latent_pixels = q.shape[2]
+        c = c.expand(bsz, 1, num_latent_pixels)
+
+        q = (q + a).logaddexp(b)
+        q = torch.cat((q, c), dim=1)
+
+        return q
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 63aa20962f05..833f2b6c5074 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -34,6 +34,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class Transformer2DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class UNet1DModel(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -257,6 +272,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class VQDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class DDIMScheduler(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -407,6 +437,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class VQDiffusionScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class EMAModel(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/pipelines/vq_diffusion/__init__.py b/tests/pipelines/vq_diffusion/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/vq_diffusion/test_vq_diffusion.py b/tests/pipelines/vq_diffusion/test_vq_diffusion.py
new file mode 100644
index 000000000000..5eb32d40d4f3
--- /dev/null
+++ b/tests/pipelines/vq_diffusion/test_vq_diffusion.py
@@ -0,0 +1,175 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import Transformer2DModel, VQDiffusionPipeline, VQDiffusionScheduler, VQModel
+from diffusers.utils import load_image, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class VQDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def num_embed(self):
+        return 12
+
+    @property
+    def num_embeds_ada_norm(self):
+        return 12
+
+    @property
+    def dummy_vqvae(self):
+        torch.manual_seed(0)
+        model = VQModel(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=3,
+            num_vq_embeddings=self.num_embed,
+            vq_embed_dim=3,
+        )
+        return model
+
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_transformer(self):
+        torch.manual_seed(0)
+
+        height = 12
+        width = 12
+
+        model_kwargs = {
+            "attention_bias": True,
+            "cross_attention_dim": 32,
+            "attention_head_dim": height * width,
+            "num_attention_heads": 1,
+            "num_vector_embeds": self.num_embed,
+            "num_embeds_ada_norm": self.num_embeds_ada_norm,
+            "norm_num_groups": 32,
+            "sample_size": width,
+            "activation_fn": "geglu-approximate",
+        }
+
+        model = Transformer2DModel(**model_kwargs)
+        return model
+
+    def test_vq_diffusion(self):
+        device = "cpu"
+
+        vqvae = self.dummy_vqvae
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        transformer = self.dummy_transformer
+        scheduler = VQDiffusionScheduler(self.num_embed)
+
+        pipe = VQDiffusionPipeline(
+            vqvae=vqvae, text_encoder=text_encoder, tokenizer=tokenizer, transformer=transformer, scheduler=scheduler
+        )
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "teddy bear playing in the pool"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = pipe(
+            [prompt], generator=generator, output_type="np", return_dict=False, num_inference_steps=2
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 24, 24, 3)
+
+        expected_slice = np.array([0.6583, 0.6410, 0.5325, 0.5635, 0.5563, 0.4234, 0.6008, 0.5491, 0.4880])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@slow
+@require_torch_gpu
+class VQDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_vq_diffusion(self):
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/vq_diffusion/teddy_bear_pool.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        pipeline = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq")
+        pipeline = pipeline.to(torch_device)
+        pipeline.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipeline(
+            "teddy bear playing in the pool",
+            truncation_rate=0.86,
+            num_images_per_prompt=1,
+            generator=generator,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (256, 256, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
diff --git a/tests/test_layers_utils.py b/tests/test_layers_utils.py
index cf531fbf3fd3..911ec548b3bd 100755
--- a/tests/test_layers_utils.py
+++ b/tests/test_layers_utils.py
@@ -18,8 +18,9 @@
 
 import numpy as np
 import torch
+from torch import nn
 
-from diffusers.models.attention import AttentionBlock, SpatialTransformer
+from diffusers.models.attention import GEGLU, AdaLayerNorm, ApproximateGELU, AttentionBlock, Transformer2DModel
 from diffusers.models.embeddings import get_timestep_embedding
 from diffusers.models.resnet import Downsample2D, Upsample2D
 from diffusers.utils import torch_device
@@ -235,7 +236,7 @@ def test_attention_block_default(self):
             num_head_channels=1,
             rescale_output_factor=1.0,
             eps=1e-6,
-            num_groups=32,
+            norm_num_groups=32,
         ).to(torch_device)
         with torch.no_grad():
             attention_scores = attentionBlock(sample)
@@ -259,7 +260,7 @@ def test_attention_block_sd(self):
             channels=512,
             rescale_output_factor=1.0,
             eps=1e-6,
-            num_groups=32,
+            norm_num_groups=32,
         ).to(torch_device)
         with torch.no_grad():
             attention_scores = attentionBlock(sample)
@@ -273,22 +274,22 @@ def test_attention_block_sd(self):
         assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
 
 
-class SpatialTransformerTests(unittest.TestCase):
+class Transformer2DModelTests(unittest.TestCase):
     def test_spatial_transformer_default(self):
         torch.manual_seed(0)
         if torch.cuda.is_available():
             torch.cuda.manual_seed_all(0)
 
         sample = torch.randn(1, 32, 64, 64).to(torch_device)
-        spatial_transformer_block = SpatialTransformer(
+        spatial_transformer_block = Transformer2DModel(
             in_channels=32,
-            n_heads=1,
-            d_head=32,
+            num_attention_heads=1,
+            attention_head_dim=32,
             dropout=0.0,
-            context_dim=None,
+            cross_attention_dim=None,
         ).to(torch_device)
         with torch.no_grad():
-            attention_scores = spatial_transformer_block(sample)
+            attention_scores = spatial_transformer_block(sample).sample
 
         assert attention_scores.shape == (1, 32, 64, 64)
         output_slice = attention_scores[0, -1, -3:, -3:]
@@ -298,22 +299,22 @@ def test_spatial_transformer_default(self):
         )
         assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
 
-    def test_spatial_transformer_context_dim(self):
+    def test_spatial_transformer_cross_attention_dim(self):
         torch.manual_seed(0)
         if torch.cuda.is_available():
             torch.cuda.manual_seed_all(0)
 
         sample = torch.randn(1, 64, 64, 64).to(torch_device)
-        spatial_transformer_block = SpatialTransformer(
+        spatial_transformer_block = Transformer2DModel(
             in_channels=64,
-            n_heads=2,
-            d_head=32,
+            num_attention_heads=2,
+            attention_head_dim=32,
             dropout=0.0,
-            context_dim=64,
+            cross_attention_dim=64,
         ).to(torch_device)
         with torch.no_grad():
             context = torch.randn(1, 4, 64).to(torch_device)
-            attention_scores = spatial_transformer_block(sample, context)
+            attention_scores = spatial_transformer_block(sample, context).sample
 
         assert attention_scores.shape == (1, 64, 64, 64)
         output_slice = attention_scores[0, -1, -3:, -3:]
@@ -323,6 +324,44 @@ def test_spatial_transformer_context_dim(self):
         )
         assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
 
+    def test_spatial_transformer_timestep(self):
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+        num_embeds_ada_norm = 5
+
+        sample = torch.randn(1, 64, 64, 64).to(torch_device)
+        spatial_transformer_block = Transformer2DModel(
+            in_channels=64,
+            num_attention_heads=2,
+            attention_head_dim=32,
+            dropout=0.0,
+            cross_attention_dim=64,
+            num_embeds_ada_norm=num_embeds_ada_norm,
+        ).to(torch_device)
+        with torch.no_grad():
+            timestep_1 = torch.tensor(1, dtype=torch.long).to(torch_device)
+            timestep_2 = torch.tensor(2, dtype=torch.long).to(torch_device)
+            attention_scores_1 = spatial_transformer_block(sample, timestep=timestep_1).sample
+            attention_scores_2 = spatial_transformer_block(sample, timestep=timestep_2).sample
+
+        assert attention_scores_1.shape == (1, 64, 64, 64)
+        assert attention_scores_2.shape == (1, 64, 64, 64)
+
+        output_slice_1 = attention_scores_1[0, -1, -3:, -3:]
+        output_slice_2 = attention_scores_2[0, -1, -3:, -3:]
+
+        expected_slice_1 = torch.tensor(
+            [-0.1874, -0.9704, -1.4290, -1.3357, 1.5138, 0.3036, -0.0976, -1.1667, 0.1283], device=torch_device
+        )
+        expected_slice_2 = torch.tensor(
+            [-0.3493, -1.0924, -1.6161, -1.5016, 1.4245, 0.1367, -0.2526, -1.3109, -0.0547], device=torch_device
+        )
+
+        assert torch.allclose(output_slice_1.flatten(), expected_slice_1, atol=1e-3)
+        assert torch.allclose(output_slice_2.flatten(), expected_slice_2, atol=1e-3)
+
     def test_spatial_transformer_dropout(self):
         torch.manual_seed(0)
         if torch.cuda.is_available():
@@ -330,18 +369,18 @@ def test_spatial_transformer_dropout(self):
 
         sample = torch.randn(1, 32, 64, 64).to(torch_device)
         spatial_transformer_block = (
-            SpatialTransformer(
+            Transformer2DModel(
                 in_channels=32,
-                n_heads=2,
-                d_head=16,
+                num_attention_heads=2,
+                attention_head_dim=16,
                 dropout=0.3,
-                context_dim=None,
+                cross_attention_dim=None,
             )
             .to(torch_device)
             .eval()
         )
         with torch.no_grad():
-            attention_scores = spatial_transformer_block(sample)
+            attention_scores = spatial_transformer_block(sample).sample
 
         assert attention_scores.shape == (1, 32, 64, 64)
         output_slice = attention_scores[0, -1, -3:, -3:]
@@ -350,3 +389,107 @@ def test_spatial_transformer_dropout(self):
             [-1.2448, -0.0190, -0.9471, -1.5140, 0.7069, -1.0144, -2.1077, 0.9099, -1.0091], device=torch_device
         )
         assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    @unittest.skipIf(torch_device == "mps", "MPS does not support float64")
+    def test_spatial_transformer_discrete(self):
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+        num_embed = 5
+
+        sample = torch.randint(0, num_embed, (1, 32)).to(torch_device)
+        spatial_transformer_block = (
+            Transformer2DModel(
+                num_attention_heads=1,
+                attention_head_dim=32,
+                num_vector_embeds=num_embed,
+                sample_size=16,
+            )
+            .to(torch_device)
+            .eval()
+        )
+
+        with torch.no_grad():
+            attention_scores = spatial_transformer_block(sample).sample
+
+        assert attention_scores.shape == (1, num_embed - 1, 32)
+
+        output_slice = attention_scores[0, -2:, -3:]
+
+        expected_slice = torch.tensor([-0.8957, -1.8370, -1.3390, -0.9152, -0.5187, -1.1702], device=torch_device)
+        assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3)
+
+    def test_spatial_transformer_default_norm_layers(self):
+        spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32)
+
+        assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == nn.LayerNorm
+        assert spatial_transformer_block.transformer_blocks[0].norm2.__class__ == nn.LayerNorm
+        assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == nn.LayerNorm
+
+    def test_spatial_transformer_ada_norm_layers(self):
+        spatial_transformer_block = Transformer2DModel(
+            num_attention_heads=1,
+            attention_head_dim=32,
+            in_channels=32,
+            num_embeds_ada_norm=5,
+        )
+
+        assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == AdaLayerNorm
+        assert spatial_transformer_block.transformer_blocks[0].norm2.__class__ == AdaLayerNorm
+        assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == nn.LayerNorm
+
+    def test_spatial_transformer_default_ff_layers(self):
+        spatial_transformer_block = Transformer2DModel(
+            num_attention_heads=1,
+            attention_head_dim=32,
+            in_channels=32,
+        )
+
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == GEGLU
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == nn.Dropout
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == nn.Linear
+
+        dim = 32
+        inner_dim = 128
+
+        # First dimension change
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.in_features == dim
+        # NOTE: inner_dim * 2 because GEGLU
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.out_features == inner_dim * 2
+
+        # Second dimension change
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].in_features == inner_dim
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].out_features == dim
+
+    def test_spatial_transformer_geglu_approx_ff_layers(self):
+        spatial_transformer_block = Transformer2DModel(
+            num_attention_heads=1,
+            attention_head_dim=32,
+            in_channels=32,
+            activation_fn="geglu-approximate",
+        )
+
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == ApproximateGELU
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == nn.Dropout
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == nn.Linear
+
+        dim = 32
+        inner_dim = 128
+
+        # First dimension change
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.in_features == dim
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.out_features == inner_dim
+
+        # Second dimension change
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].in_features == inner_dim
+        assert spatial_transformer_block.transformer_blocks[0].ff.net[2].out_features == dim
+
+    def test_spatial_transformer_attention_bias(self):
+        spatial_transformer_block = Transformer2DModel(
+            num_attention_heads=1, attention_head_dim=32, in_channels=32, attention_bias=True
+        )
+
+        assert spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias is not None
+        assert spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias is not None
+        assert spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias is not None
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 9285eed20ff0..29186aaac99b 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -19,6 +19,7 @@
 
 import numpy as np
 import torch
+import torch.nn.functional as F
 
 from diffusers import (
     DDIMScheduler,
@@ -29,6 +30,7 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
     ScoreSdeVeScheduler,
+    VQDiffusionScheduler,
 )
 from diffusers.utils import torch_device
 
@@ -85,12 +87,18 @@ def check_over_configs(self, time_step=0, **config):
             if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
                 time_step = float(time_step)
 
-            sample = self.dummy_sample
-            residual = 0.1 * sample
-
             scheduler_config = self.get_scheduler_config(**config)
             scheduler = scheduler_class(**scheduler_config)
 
+            if scheduler_class == VQDiffusionScheduler:
+                num_vec_classes = scheduler_config["num_vec_classes"]
+                sample = self.dummy_sample(num_vec_classes)
+                model = self.dummy_model(num_vec_classes)
+                residual = model(sample, time_step)
+            else:
+                sample = self.dummy_sample
+                residual = 0.1 * sample
+
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
                 new_scheduler = scheduler_class.from_config(tmpdirname)
@@ -122,12 +130,18 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
             if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
                 time_step = float(time_step)
 
-            sample = self.dummy_sample
-            residual = 0.1 * sample
-
             scheduler_config = self.get_scheduler_config()
             scheduler = scheduler_class(**scheduler_config)
 
+            if scheduler_class == VQDiffusionScheduler:
+                num_vec_classes = scheduler_config["num_vec_classes"]
+                sample = self.dummy_sample(num_vec_classes)
+                model = self.dummy_model(num_vec_classes)
+                residual = model(sample, time_step)
+            else:
+                sample = self.dummy_sample
+                residual = 0.1 * sample
+
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
                 new_scheduler = scheduler_class.from_config(tmpdirname)
@@ -154,15 +168,21 @@ def test_from_pretrained_save_pretrained(self):
         num_inference_steps = kwargs.pop("num_inference_steps", None)
 
         for scheduler_class in self.scheduler_classes:
-            sample = self.dummy_sample
-            residual = 0.1 * sample
+            timestep = 1
+            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
+                timestep = float(timestep)
 
             scheduler_config = self.get_scheduler_config()
             scheduler = scheduler_class(**scheduler_config)
 
-            timestep = 1
-            if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
-                timestep = float(timestep)
+            if scheduler_class == VQDiffusionScheduler:
+                num_vec_classes = scheduler_config["num_vec_classes"]
+                sample = self.dummy_sample(num_vec_classes)
+                model = self.dummy_model(num_vec_classes)
+                residual = model(sample, timestep)
+            else:
+                sample = self.dummy_sample
+                residual = 0.1 * sample
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
@@ -200,8 +220,14 @@ def test_step_shape(self):
             scheduler_config = self.get_scheduler_config()
             scheduler = scheduler_class(**scheduler_config)
 
-            sample = self.dummy_sample
-            residual = 0.1 * sample
+            if scheduler_class == VQDiffusionScheduler:
+                num_vec_classes = scheduler_config["num_vec_classes"]
+                sample = self.dummy_sample(num_vec_classes)
+                model = self.dummy_model(num_vec_classes)
+                residual = model(sample, timestep_0)
+            else:
+                sample = self.dummy_sample
+                residual = 0.1 * sample
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 scheduler.set_timesteps(num_inference_steps)
@@ -255,8 +281,14 @@ def recursive_check(tuple_object, dict_object):
             scheduler_config = self.get_scheduler_config()
             scheduler = scheduler_class(**scheduler_config)
 
-            sample = self.dummy_sample
-            residual = 0.1 * sample
+            if scheduler_class == VQDiffusionScheduler:
+                num_vec_classes = scheduler_config["num_vec_classes"]
+                sample = self.dummy_sample(num_vec_classes)
+                model = self.dummy_model(num_vec_classes)
+                residual = model(sample, timestep)
+            else:
+                sample = self.dummy_sample
+                residual = 0.1 * sample
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 scheduler.set_timesteps(num_inference_steps)
@@ -284,22 +316,26 @@ def test_scheduler_public_api(self):
         for scheduler_class in self.scheduler_classes:
             scheduler_config = self.get_scheduler_config()
             scheduler = scheduler_class(**scheduler_config)
-            self.assertTrue(
-                hasattr(scheduler, "init_noise_sigma"),
-                f"{scheduler_class} does not implement a required attribute `init_noise_sigma`",
-            )
-            self.assertTrue(
-                hasattr(scheduler, "scale_model_input"),
-                f"{scheduler_class} does not implement a required class method `scale_model_input(sample, timestep)`",
-            )
+
+            if scheduler_class != VQDiffusionScheduler:
+                self.assertTrue(
+                    hasattr(scheduler, "init_noise_sigma"),
+                    f"{scheduler_class} does not implement a required attribute `init_noise_sigma`",
+                )
+                self.assertTrue(
+                    hasattr(scheduler, "scale_model_input"),
+                    f"{scheduler_class} does not implement a required class method `scale_model_input(sample,"
+                    " timestep)`",
+                )
             self.assertTrue(
                 hasattr(scheduler, "step"),
                 f"{scheduler_class} does not implement a required class method `step(...)`",
             )
 
-            sample = self.dummy_sample
-            scaled_sample = scheduler.scale_model_input(sample, 0.0)
-            self.assertEqual(sample.shape, scaled_sample.shape)
+            if scheduler_class != VQDiffusionScheduler:
+                sample = self.dummy_sample
+                scaled_sample = scheduler.scale_model_input(sample, 0.0)
+                self.assertEqual(sample.shape, scaled_sample.shape)
 
     def test_add_noise_device(self):
         for scheduler_class in self.scheduler_classes:
@@ -1238,3 +1274,53 @@ def test_full_loop_no_noise(self):
         result_mean = torch.mean(torch.abs(sample))
 
         assert abs(result_mean.item() - 2540529) < 10
+
+
+class VQDiffusionSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (VQDiffusionScheduler,)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_vec_classes": 4097,
+            "num_train_timesteps": 100,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def dummy_sample(self, num_vec_classes):
+        batch_size = 4
+        height = 8
+        width = 8
+
+        sample = torch.randint(0, num_vec_classes, (batch_size, height * width))
+
+        return sample
+
+    @property
+    def dummy_sample_deter(self):
+        assert False
+
+    def dummy_model(self, num_vec_classes):
+        def model(sample, t, *args):
+            batch_size, num_latent_pixels = sample.shape
+            logits = torch.rand((batch_size, num_vec_classes - 1, num_latent_pixels))
+            return_value = F.log_softmax(logits.double(), dim=1).float()
+            return return_value
+
+        return model
+
+    def test_timesteps(self):
+        for timesteps in [2, 5, 100, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_num_vec_classes(self):
+        for num_vec_classes in [5, 100, 1000, 4000]:
+            self.check_over_configs(num_vec_classes=num_vec_classes)
+
+    def test_time_indices(self):
+        for t in [0, 50, 99]:
+            self.check_over_forward(time_step=t)
+
+    def test_add_noise_device(self):
+        pass

From 7482178162b779506a54538f2cf2565c8b88c597 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Thu, 3 Nov 2022 17:25:57 +0100
Subject: [PATCH 118/269] =?UTF-8?q?default=20fast=20model=20loading=20?=
 =?UTF-8?q?=F0=9F=94=A5=20(#1115)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* make accelerate hard dep

* default fast init

* move params to cpu when device map is None

* handle device_map=None

* handle torch < 1.9

* remove device_map="auto"

* style

* add accelerate in torch extra

* remove accelerate from extras["test"]

* raise an error if torch is available but not accelerate

* update installation docs

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* improve defautl loading speed even further, allow disabling fats loading

* address review comments

* adapt the tests

* fix test_stable_diffusion_fast_load

* fix test_read_init

* temp fix for dummy checks

* Trigger Build

* Apply suggestions from code review

Co-authored-by: Anton Lozhkov <anton@huggingface.co>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 README.md                                     |  16 +-
 docs/source/installation.mdx                  |  40 +-
 setup.py                                      |   3 +-
 src/diffusers/__init__.py                     |   8 +
 src/diffusers/modeling_utils.py               |  47 ++-
 src/diffusers/pipeline_utils.py               |  10 +
 .../dummy_torch_and_accelerate_objects.py     | 392 ++++++++++++++++++
 tests/models/test_models_unet_1d.py           |   2 +-
 tests/models/test_models_unet_2d.py           |  28 +-
 tests/models/test_models_vae.py               |   5 +-
 .../dance_diffusion/test_dance_diffusion.py   |   6 +-
 tests/pipelines/ddim/test_ddim.py             |   4 +-
 tests/pipelines/ddpm/test_ddpm.py             |   2 +-
 tests/pipelines/karras_ve/test_karras_ve.py   |   2 +-
 .../latent_diffusion/test_latent_diffusion.py |   4 +-
 tests/pipelines/pndm/test_pndm.py             |   2 +-
 .../score_sde_ve/test_score_sde_ve.py         |   2 +-
 .../stable_diffusion/test_stable_diffusion.py |  38 +-
 .../test_stable_diffusion_img2img.py          |   6 +-
 .../test_stable_diffusion_inpaint.py          |  11 +-
 .../test_stable_diffusion_inpaint_legacy.py   |   9 +-
 tests/repo_utils/test_check_dummies.py        |   4 +-
 tests/test_pipelines.py                       |  32 +-
 23 files changed, 564 insertions(+), 109 deletions(-)
 create mode 100644 src/diffusers/utils/dummy_torch_and_accelerate_objects.py

diff --git a/README.md b/README.md
index 03365a2a5006..9c44cff2c8e2 100644
--- a/README.md
+++ b/README.md
@@ -27,10 +27,12 @@ More precisely, 🤗 Diffusers offers:
 
 ## Installation
 
+### For PyTorch
+
 **With `pip`**
     
 ```bash
-pip install --upgrade diffusers
+pip install --upgrade diffusers[torch]
 ```
 
 **With `conda`**
@@ -39,6 +41,14 @@ pip install --upgrade diffusers
 conda install -c conda-forge diffusers
 ```
 
+### For Flax
+
+**With `pip`**
+
+```bash
+pip install --upgrade diffusers[flax]
+```
+
 **Apple Silicon (M1/M2) support**
 
 Please, refer to [the documentation](https://huggingface.co/docs/diffusers/optimization/mps).
@@ -354,7 +364,7 @@ There are many ways to try running Diffusers! Here we outline code-focused tools
 If you want to run the code yourself 💻, you can try out:
 - [Text-to-Image Latent Diffusion](https://huggingface.co/CompVis/ldm-text2im-large-256)
 ```python
-# !pip install diffusers transformers
+# !pip install diffusers["torch"] transformers
 from diffusers import DiffusionPipeline
 
 device = "cuda"
@@ -373,7 +383,7 @@ image.save("squirrel.png")
 ```
 - [Unconditional Diffusion with discrete scheduler](https://huggingface.co/google/ddpm-celebahq-256)
 ```python
-# !pip install diffusers
+# !pip install diffusers["torch"]
 from diffusers import DDPMPipeline, DDIMPipeline, PNDMPipeline
 
 model_id = "google/ddpm-celebahq-256"
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 1c9460eca8bd..9c93b359560e 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -12,9 +12,12 @@ specific language governing permissions and limitations under the License.
 
 # Installation
 
-Install Diffusers for with PyTorch. Support for other libraries will come in the future
+Install 🤗 Diffusers for whichever deep learning library you’re working with.
 
-🤗 Diffusers is tested on Python 3.7+, and PyTorch 1.7.0+.
+🤗 Diffusers is tested on Python 3.7+, PyTorch 1.7.0+ and flax. Follow the installation instructions below for the deep learning library you are using:
+
+- [PyTorch](https://pytorch.org/get-started/locally/) installation instructions.
+- [Flax](https://flax.readthedocs.io/en/latest/) installation instructions.
 
 ## Install with pip
 
@@ -36,12 +39,30 @@ source .env/bin/activate
 
 Now you're ready to install 🤗 Diffusers with the following command:
 
+**For PyTorch**
+
+```bash
+pip install diffusers["torch"]
+```
+
+**For Flax**
+
 ```bash
-pip install diffusers
+pip install diffusers["flax"]
 ```
 
 ## Install from source
 
+Before intsalling `diffusers` from source, make sure you have `torch` and `accelerate` installed.
+
+For `torch` installation refer to the `torch` [docs](https://pytorch.org/get-started/locally/#start-locally).
+
+To install `accelerate`
+
+```bash
+pip install accelerate
+```
+
 Install 🤗 Diffusers from source with the following command:
 
 ```bash
@@ -67,7 +88,18 @@ Clone the repository and install 🤗 Diffusers with the following commands:
 ```bash
 git clone https://github.com/huggingface/diffusers.git
 cd diffusers
-pip install -e .
+```
+
+**For PyTorch**
+
+```
+pip install -e ".[torch]"
+```
+
+**For Flax**
+
+```
+pip install -e ".[flax]"
 ```
 
 These commands will link the folder you cloned the repository to and your Python library paths.
diff --git a/setup.py b/setup.py
index 8904242a3158..becd4cbb87a8 100644
--- a/setup.py
+++ b/setup.py
@@ -178,7 +178,6 @@ def run(self):
 extras["docs"] = deps_list("hf-doc-builder")
 extras["training"] = deps_list("accelerate", "datasets", "tensorboard", "modelcards")
 extras["test"] = deps_list(
-    "accelerate",
     "datasets",
     "parameterized",
     "pytest",
@@ -188,7 +187,7 @@ def run(self):
     "torchvision",
     "transformers"
 )
-extras["torch"] = deps_list("torch")
+extras["torch"] = deps_list("torch", "accelerate")
 
 if os.name == "nt":  # windows
     extras["flax"] = []  # jax is not supported on windows
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 00052109e32f..61ac2425db43 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -1,4 +1,5 @@
 from .utils import (
+    is_accelerate_available,
     is_flax_available,
     is_inflect_available,
     is_onnx_available,
@@ -16,6 +17,13 @@
 from .utils import logging
 
 
+# This will create an extra dummy file "dummy_torch_and_accelerate_objects.py"
+# TODO: (patil-suraj, anton-l) maybe import everything under is_torch_and_accelerate_available
+if is_torch_available() and not is_accelerate_available():
+    error_msg = "Please install the `accelerate` library to use Diffusers with PyTorch. You can do so by running `pip install diffusers[torch]`. Or if torch is already installed, you can run `pip install accelerate`."  # noqa: E501
+    raise ImportError(error_msg)
+
+
 if is_torch_available():
     from .modeling_utils import ModelMixin
     from .models import AutoencoderKL, Transformer2DModel, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel
diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py
index c62caf80286e..f4697636719e 100644
--- a/src/diffusers/modeling_utils.py
+++ b/src/diffusers/modeling_utils.py
@@ -21,7 +21,9 @@
 import torch
 from torch import Tensor, device
 
-from diffusers.utils import is_accelerate_available
+import accelerate
+from accelerate.utils import set_module_tensor_to_device
+from accelerate.utils.versions import is_torch_version
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
 from requests import HTTPError
@@ -268,6 +270,19 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to accelerate downloads in China. If you are from China and have an accessibility
                 problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
                 Please refer to the mirror site for more information.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn't need to be refined to each
+                parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
+                same device.
+
+                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            fast_load (`bool`, *optional*, defaults to `True`):
+                Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
+                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
+                model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
+                this argument will be ignored and the model will be loaded normally.
 
         <Tip>
 
@@ -296,6 +311,16 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         torch_dtype = kwargs.pop("torch_dtype", None)
         subfolder = kwargs.pop("subfolder", None)
         device_map = kwargs.pop("device_map", None)
+        fast_load = kwargs.pop("fast_load", True)
+
+        # Check if we can handle device_map and dispatching the weights
+        if device_map is not None and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError("Loading and dispatching requires torch >= 1.9.0")
+
+        # Fast init is only possible if torch version is >= 1.9.0
+        _INIT_EMPTY_WEIGHTS = fast_load or device_map is not None
+        if _INIT_EMPTY_WEIGHTS and not is_torch_version(">=", "1.9.0"):
+            logger.warn("Loading with `fast_load` requires torch >= 1.9.0. Falling back to normal loading.")
 
         user_agent = {
             "diffusers": __version__,
@@ -378,12 +403,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
             # restore default dtype
 
-        if device_map == "auto":
-            if is_accelerate_available():
-                import accelerate
-            else:
-                raise ImportError("Please install accelerate via `pip install accelerate`")
-
+        if _INIT_EMPTY_WEIGHTS:
+            # Instantiate model with empty weights
             with accelerate.init_empty_weights():
                 model, unused_kwargs = cls.from_config(
                     config_path,
@@ -400,7 +421,17 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     **kwargs,
                 )
 
-            accelerate.load_checkpoint_and_dispatch(model, model_file, device_map)
+            # if device_map is Non,e load the state dict on move the params from meta device to the cpu
+            if device_map is None:
+                param_device = "cpu"
+                state_dict = load_state_dict(model_file)
+                # move the parms from meta device to cpu
+                for param_name, param in state_dict.items():
+                    set_module_tensor_to_device(model, param_name, param_device, value=param)
+            else:  # else let accelerate handle loading and dispatching.
+                # Load weights and dispatch according to the device_map
+                # by deafult the device_map is None and the weights are loaded on the CPU
+                accelerate.load_checkpoint_and_dispatch(model, model_file, device_map)
 
             loading_info = {
                 "missing_keys": [],
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 4ba8d2d9300d..5c248ec1a956 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -380,6 +380,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         provider = kwargs.pop("provider", None)
         sess_options = kwargs.pop("sess_options", None)
         device_map = kwargs.pop("device_map", None)
+        fast_load = kwargs.pop("fast_load", True)
 
         # 1. Download the checkpoints and configs
         # use snapshot download here to get it working from from_pretrained
@@ -572,6 +573,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     and version.parse(version.parse(transformers.__version__).base_version) >= version.parse("4.20.0")
                 )
 
+                if is_diffusers_model:
+                    loading_kwargs["fast_load"] = fast_load
+
+                # When loading a transformers model, if the device_map is None, the weights will be initialized as opposed to diffusers.
+                # To make default loading faster we set the `low_cpu_mem_usage=fast_load` flag which is `True` by default.
+                # This makes sure that the weights won't be initialized which significantly speeds up loading.
+                if is_transformers_model and device_map is None:
+                    loading_kwargs["low_cpu_mem_usage"] = fast_load
+
                 if is_diffusers_model or is_transformers_model:
                     loading_kwargs["device_map"] = device_map
 
diff --git a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py
new file mode 100644
index 000000000000..a1650f62dbad
--- /dev/null
+++ b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py
@@ -0,0 +1,392 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+# flake8: noqa
+
+from ..utils import DummyObject, requires_backends
+
+
+class ModelMixin(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class AutoencoderKL(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class UNet1DModel(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class UNet2DConditionModel(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class UNet2DModel(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class VQModel(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+def get_constant_schedule(*args, **kwargs):
+    requires_backends(get_constant_schedule, ["torch", "accelerate"])
+
+
+def get_constant_schedule_with_warmup(*args, **kwargs):
+    requires_backends(get_constant_schedule_with_warmup, ["torch", "accelerate"])
+
+
+def get_cosine_schedule_with_warmup(*args, **kwargs):
+    requires_backends(get_cosine_schedule_with_warmup, ["torch", "accelerate"])
+
+
+def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs):
+    requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["torch", "accelerate"])
+
+
+def get_linear_schedule_with_warmup(*args, **kwargs):
+    requires_backends(get_linear_schedule_with_warmup, ["torch", "accelerate"])
+
+
+def get_polynomial_decay_schedule_with_warmup(*args, **kwargs):
+    requires_backends(get_polynomial_decay_schedule_with_warmup, ["torch", "accelerate"])
+
+
+def get_scheduler(*args, **kwargs):
+    requires_backends(get_scheduler, ["torch", "accelerate"])
+
+
+class DiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class DanceDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class DDIMPipeline(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class DDPMPipeline(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class KarrasVePipeline(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class LDMPipeline(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class PNDMPipeline(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class ScoreSdeVePipeline(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class DDIMScheduler(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class DDPMScheduler(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class EulerAncestralDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class EulerDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class IPNDMScheduler(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class KarrasVeScheduler(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class PNDMScheduler(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class SchedulerMixin(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class ScoreSdeVeScheduler(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
+class EMAModel(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
diff --git a/tests/models/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py
index 286c7525e2ca..c274ce419211 100644
--- a/tests/models/test_models_unet_1d.py
+++ b/tests/models/test_models_unet_1d.py
@@ -28,7 +28,7 @@ class UnetModel1DTests(unittest.TestCase):
     @slow
     def test_unet_1d_maestro(self):
         model_id = "harmonai/maestro-150k"
-        model = UNet1DModel.from_pretrained(model_id, subfolder="unet", device_map="auto")
+        model = UNet1DModel.from_pretrained(model_id, subfolder="unet")
         model.to(torch_device)
 
         sample_size = 65536
diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
index 20371708a4d8..feee72457769 100644
--- a/tests/models/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -125,9 +125,7 @@ def test_from_pretrained_hub(self):
 
     @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU")
     def test_from_pretrained_accelerate(self):
-        model, _ = UNet2DModel.from_pretrained(
-            "fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto"
-        )
+        model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
         model.to(torch_device)
         image = model(**self.dummy_input).sample
 
@@ -135,9 +133,8 @@ def test_from_pretrained_accelerate(self):
 
     @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU")
     def test_from_pretrained_accelerate_wont_change_results(self):
-        model_accelerate, _ = UNet2DModel.from_pretrained(
-            "fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto"
-        )
+        # by defautl model loading will use accelerate as `fast_load=True`
+        model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
         model_accelerate.to(torch_device)
         model_accelerate.eval()
 
@@ -159,7 +156,7 @@ def test_from_pretrained_accelerate_wont_change_results(self):
         gc.collect()
 
         model_normal_load, _ = UNet2DModel.from_pretrained(
-            "fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto"
+            "fusing/unet-ldm-dummy-update", output_loading_info=True, fast_init=False
         )
         model_normal_load.to(torch_device)
         model_normal_load.eval()
@@ -173,9 +170,8 @@ def test_memory_footprint_gets_reduced(self):
         gc.collect()
 
         tracemalloc.start()
-        model_accelerate, _ = UNet2DModel.from_pretrained(
-            "fusing/unet-ldm-dummy-update", output_loading_info=True, device_map="auto"
-        )
+        # by defautl model loading will use accelerate as `fast_load=True`
+        model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
         model_accelerate.to(torch_device)
         model_accelerate.eval()
         _, peak_accelerate = tracemalloc.get_traced_memory()
@@ -184,7 +180,9 @@ def test_memory_footprint_gets_reduced(self):
         torch.cuda.empty_cache()
         gc.collect()
 
-        model_normal_load, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
+        model_normal_load, _ = UNet2DModel.from_pretrained(
+            "fusing/unet-ldm-dummy-update", output_loading_info=True, fast_init=False
+        )
         model_normal_load.to(torch_device)
         model_normal_load.eval()
         _, peak_normal = tracemalloc.get_traced_memory()
@@ -348,9 +346,7 @@ def prepare_init_args_and_inputs_for_common(self):
 
     @slow
     def test_from_pretrained_hub(self):
-        model, loading_info = UNet2DModel.from_pretrained(
-            "google/ncsnpp-celebahq-256", output_loading_info=True, device_map="auto"
-        )
+        model, loading_info = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", output_loading_info=True)
         self.assertIsNotNone(model)
         self.assertEqual(len(loading_info["missing_keys"]), 0)
 
@@ -364,7 +360,7 @@ def test_from_pretrained_hub(self):
 
     @slow
     def test_output_pretrained_ve_mid(self):
-        model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", device_map="auto")
+        model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256")
         model.to(torch_device)
 
         torch.manual_seed(0)
@@ -439,7 +435,7 @@ def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"):
         torch_dtype = torch.float16 if fp16 else torch.float32
 
         model = UNet2DConditionModel.from_pretrained(
-            model_id, subfolder="unet", torch_dtype=torch_dtype, revision=revision, device_map="auto"
+            model_id, subfolder="unet", torch_dtype=torch_dtype, revision=revision
         )
         model.to(torch_device).eval()
 
diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py
index 3da7b50e34f3..169365756103 100644
--- a/tests/models/test_models_vae.py
+++ b/tests/models/test_models_vae.py
@@ -155,7 +155,10 @@ def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False)
         torch_dtype = torch.float16 if fp16 else torch.float32
 
         model = AutoencoderKL.from_pretrained(
-            model_id, subfolder="vae", torch_dtype=torch_dtype, revision=revision, device_map="auto"
+            model_id,
+            subfolder="vae",
+            torch_dtype=torch_dtype,
+            revision=revision,
         )
         model.to(torch_device).eval()
 
diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
index 737d1c57d154..72e67e4479d2 100644
--- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py
+++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -86,7 +86,7 @@ def tearDown(self):
     def test_dance_diffusion(self):
         device = torch_device
 
-        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", device_map="auto")
+        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k")
         pipe = pipe.to(device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -103,9 +103,7 @@ def test_dance_diffusion(self):
     def test_dance_diffusion_fp16(self):
         device = torch_device
 
-        pipe = DanceDiffusionPipeline.from_pretrained(
-            "harmonai/maestro-150k", torch_dtype=torch.float16, device_map="auto"
-        )
+        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16)
         pipe = pipe.to(device)
         pipe.set_progress_bar_config(disable=None)
 
diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py
index 41280a9bb4f0..4445fe7feecf 100644
--- a/tests/pipelines/ddim/test_ddim.py
+++ b/tests/pipelines/ddim/test_ddim.py
@@ -78,7 +78,7 @@ class DDIMPipelineIntegrationTests(unittest.TestCase):
     def test_inference_ema_bedroom(self):
         model_id = "google/ddpm-ema-bedroom-256"
 
-        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
+        unet = UNet2DModel.from_pretrained(model_id)
         scheduler = DDIMScheduler.from_config(model_id)
 
         ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
@@ -97,7 +97,7 @@ def test_inference_ema_bedroom(self):
     def test_inference_cifar10(self):
         model_id = "google/ddpm-cifar10-32"
 
-        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
+        unet = UNet2DModel.from_pretrained(model_id)
         scheduler = DDIMScheduler()
 
         ddim = DDIMPipeline(unet=unet, scheduler=scheduler)
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
index c6b24309ced0..c58e2db38f21 100644
--- a/tests/pipelines/ddpm/test_ddpm.py
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -38,7 +38,7 @@ class DDPMPipelineIntegrationTests(unittest.TestCase):
     def test_inference_cifar10(self):
         model_id = "google/ddpm-cifar10-32"
 
-        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
+        unet = UNet2DModel.from_pretrained(model_id)
         scheduler = DDPMScheduler.from_config(model_id)
 
         ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
diff --git a/tests/pipelines/karras_ve/test_karras_ve.py b/tests/pipelines/karras_ve/test_karras_ve.py
index caca35f693cf..1fafa1cb40fa 100644
--- a/tests/pipelines/karras_ve/test_karras_ve.py
+++ b/tests/pipelines/karras_ve/test_karras_ve.py
@@ -70,7 +70,7 @@ def test_inference(self):
 class KarrasVePipelineIntegrationTests(unittest.TestCase):
     def test_inference(self):
         model_id = "google/ncsnpp-celebahq-256"
-        model = UNet2DModel.from_pretrained(model_id, device_map="auto")
+        model = UNet2DModel.from_pretrained(model_id)
         scheduler = KarrasVeScheduler()
 
         pipe = KarrasVePipeline(unet=model, scheduler=scheduler)
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/tests/pipelines/latent_diffusion/test_latent_diffusion.py
index beb30a24f6d0..085cdb4e766b 100644
--- a/tests/pipelines/latent_diffusion/test_latent_diffusion.py
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion.py
@@ -121,7 +121,7 @@ def test_inference_text2img(self):
 @require_torch
 class LDMTextToImagePipelineIntegrationTests(unittest.TestCase):
     def test_inference_text2img(self):
-        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256", device_map="auto")
+        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
         ldm.to(torch_device)
         ldm.set_progress_bar_config(disable=None)
 
@@ -138,7 +138,7 @@ def test_inference_text2img(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_inference_text2img_fast(self):
-        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256", device_map="auto")
+        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
         ldm.to(torch_device)
         ldm.set_progress_bar_config(disable=None)
 
diff --git a/tests/pipelines/pndm/test_pndm.py b/tests/pipelines/pndm/test_pndm.py
index 02f5b1fe4fb9..5d9212223e6e 100644
--- a/tests/pipelines/pndm/test_pndm.py
+++ b/tests/pipelines/pndm/test_pndm.py
@@ -71,7 +71,7 @@ class PNDMPipelineIntegrationTests(unittest.TestCase):
     def test_inference_cifar10(self):
         model_id = "google/ddpm-cifar10-32"
 
-        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
+        unet = UNet2DModel.from_pretrained(model_id)
         scheduler = PNDMScheduler()
 
         pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
diff --git a/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/tests/pipelines/score_sde_ve/test_score_sde_ve.py
index e2f7bc22eca2..55dcc1cea1e0 100644
--- a/tests/pipelines/score_sde_ve/test_score_sde_ve.py
+++ b/tests/pipelines/score_sde_ve/test_score_sde_ve.py
@@ -72,7 +72,7 @@ def test_inference(self):
 class ScoreSdeVePipelineIntegrationTests(unittest.TestCase):
     def test_inference(self):
         model_id = "google/ncsnpp-church-256"
-        model = UNet2DModel.from_pretrained(model_id, device_map="auto")
+        model = UNet2DModel.from_pretrained(model_id)
 
         scheduler = ScoreSdeVeScheduler.from_config(model_id)
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index ded2470cc2e5..0f7798735585 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -631,7 +631,7 @@ def tearDown(self):
 
     def test_stable_diffusion(self):
         # make sure here that pndm scheduler skips prk
-        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1", device_map="auto")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
         sd_pipe = sd_pipe.to(torch_device)
         sd_pipe.set_progress_bar_config(disable=None)
 
@@ -653,9 +653,7 @@ def test_stable_diffusion(self):
     def test_stable_diffusion_fast_ddim(self):
         scheduler = DDIMScheduler.from_config("CompVis/stable-diffusion-v1-1", subfolder="scheduler")
 
-        sd_pipe = StableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-1", scheduler=scheduler, device_map="auto"
-        )
+        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1", scheduler=scheduler)
         sd_pipe = sd_pipe.to(torch_device)
         sd_pipe.set_progress_bar_config(disable=None)
 
@@ -674,7 +672,7 @@ def test_stable_diffusion_fast_ddim(self):
 
     def test_lms_stable_diffusion_pipeline(self):
         model_id = "CompVis/stable-diffusion-v1-1"
-        pipe = StableDiffusionPipeline.from_pretrained(model_id, device_map="auto").to(torch_device)
+        pipe = StableDiffusionPipeline.from_pretrained(model_id).to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         scheduler = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
         pipe.scheduler = scheduler
@@ -693,9 +691,7 @@ def test_lms_stable_diffusion_pipeline(self):
     def test_stable_diffusion_memory_chunking(self):
         torch.cuda.reset_peak_memory_stats()
         model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionPipeline.from_pretrained(
-            model_id, revision="fp16", torch_dtype=torch.float16, device_map="auto"
-        )
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -732,9 +728,7 @@ def test_stable_diffusion_memory_chunking(self):
     def test_stable_diffusion_text2img_pipeline_fp16(self):
         torch.cuda.reset_peak_memory_stats()
         model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionPipeline.from_pretrained(
-            model_id, revision="fp16", device_map="auto", torch_dtype=torch.float16
-        )
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
         pipe = pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -767,11 +761,7 @@ def test_stable_diffusion_text2img_pipeline(self):
         expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionPipeline.from_pretrained(
-            model_id,
-            safety_checker=None,
-            device_map="auto",
-        )
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, safety_checker=None)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing()
@@ -812,7 +802,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
         test_callback_fn.has_been_called = False
 
         pipe = StableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, device_map="auto"
+            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
         )
         pipe = pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -833,23 +823,23 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
         assert test_callback_fn.has_been_called
         assert number_of_steps == 51
 
-    def test_stable_diffusion_accelerate_auto_device(self):
+    def test_stable_diffusion_fast_load(self):
         pipeline_id = "CompVis/stable-diffusion-v1-4"
 
         start_time = time.time()
-        pipeline_normal_load = StableDiffusionPipeline.from_pretrained(
+        pipeline_fast_load = StableDiffusionPipeline.from_pretrained(
             pipeline_id, revision="fp16", torch_dtype=torch.float16
         )
-        pipeline_normal_load.to(torch_device)
-        normal_load_time = time.time() - start_time
+        pipeline_fast_load.to(torch_device)
+        fast_load_time = time.time() - start_time
 
         start_time = time.time()
         _ = StableDiffusionPipeline.from_pretrained(
-            pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto"
+            pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, fast_load=False
         )
-        meta_device_load_time = time.time() - start_time
+        normal_load_time = time.time() - start_time
 
-        assert 2 * meta_device_load_time < normal_load_time
+        assert 2 * fast_load_time < normal_load_time
 
     @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
     def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 78d001e3c758..f5a92fac876f 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -488,7 +488,6 @@ def test_stable_diffusion_img2img_pipeline(self):
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
             model_id,
             safety_checker=None,
-            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -529,7 +528,6 @@ def test_stable_diffusion_img2img_pipeline_k_lms(self):
             model_id,
             scheduler=lms,
             safety_checker=None,
-            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -581,7 +579,9 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
         init_image = init_image.resize((768, 512))
 
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, device_map="auto"
+            "CompVis/stable-diffusion-v1-4",
+            revision="fp16",
+            torch_dtype=torch.float16,
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index f5a8b3cf9ecc..44a7a3249a7e 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -284,11 +284,7 @@ def test_stable_diffusion_inpaint_pipeline(self):
         )
 
         model_id = "runwayml/stable-diffusion-inpainting"
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            model_id,
-            safety_checker=None,
-            device_map="auto",
-        )
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing()
@@ -328,7 +324,6 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self):
             revision="fp16",
             torch_dtype=torch.float16,
             safety_checker=None,
-            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -365,9 +360,7 @@ def test_stable_diffusion_inpaint_pipeline_pndm(self):
 
         model_id = "runwayml/stable-diffusion-inpainting"
         pndm = PNDMScheduler.from_config(model_id, subfolder="scheduler")
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            model_id, safety_checker=None, scheduler=pndm, device_map="auto"
-        )
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None, scheduler=pndm)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing()
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index 81deba67f274..c5b2572fb79a 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -364,11 +364,7 @@ def test_stable_diffusion_inpaint_legacy_pipeline(self):
         )
 
         model_id = "CompVis/stable-diffusion-v1-4"
-        pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            model_id,
-            safety_checker=None,
-            device_map="auto",
-        )
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
         pipe.enable_attention_slicing()
@@ -411,7 +407,6 @@ def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
             model_id,
             scheduler=lms,
             safety_checker=None,
-            device_map="auto",
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -468,7 +463,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
         )
 
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16, device_map="auto"
+            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
diff --git a/tests/repo_utils/test_check_dummies.py b/tests/repo_utils/test_check_dummies.py
index d8fa9ce10547..0331b5e8c2cc 100644
--- a/tests/repo_utils/test_check_dummies.py
+++ b/tests/repo_utils/test_check_dummies.py
@@ -52,13 +52,13 @@ def test_find_backend(self):
     def test_read_init(self):
         objects = read_init()
         # We don't assert on the exact list of keys to allow for smooth grow of backend-specific objects
-        self.assertIn("torch", objects)
+        self.assertIn("torch_and_accelerate", objects)
         self.assertIn("torch_and_transformers", objects)
         self.assertIn("flax_and_transformers", objects)
         self.assertIn("torch_and_transformers_and_onnx", objects)
 
         # Likewise, we can't assert on the exact content of a key
-        self.assertIn("UNet2DModel", objects["torch"])
+        self.assertIn("UNet2DModel", objects["torch_and_accelerate"])
         self.assertIn("FlaxUNet2DConditionModel", objects["flax"])
         self.assertIn("StableDiffusionPipeline", objects["torch_and_transformers"])
         self.assertIn("FlaxStableDiffusionPipeline", objects["flax_and_transformers"])
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 1654518f1e93..b8316075fa93 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -128,7 +128,7 @@ def test_local_custom_pipeline(self):
     def test_load_pipeline_from_git(self):
         clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
 
-        feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id, device_map="auto")
+        feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_id)
         clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16)
 
         pipeline = DiffusionPipeline.from_pretrained(
@@ -138,7 +138,6 @@ def test_load_pipeline_from_git(self):
             feature_extractor=feature_extractor,
             torch_dtype=torch.float16,
             revision="fp16",
-            device_map="auto",
         )
         pipeline.enable_attention_slicing()
         pipeline = pipeline.to(torch_device)
@@ -333,9 +332,7 @@ def tearDown(self):
     def test_smart_download(self):
         model_id = "hf-internal-testing/unet-pipeline-dummy"
         with tempfile.TemporaryDirectory() as tmpdirname:
-            _ = DiffusionPipeline.from_pretrained(
-                model_id, cache_dir=tmpdirname, force_download=True, device_map="auto"
-            )
+            _ = DiffusionPipeline.from_pretrained(model_id, cache_dir=tmpdirname, force_download=True)
             local_repo_name = "--".join(["models"] + model_id.split("/"))
             snapshot_dir = os.path.join(tmpdirname, local_repo_name, "snapshots")
             snapshot_dir = os.path.join(snapshot_dir, os.listdir(snapshot_dir)[0])
@@ -359,7 +356,10 @@ def test_warning_unused_kwargs(self):
         with tempfile.TemporaryDirectory() as tmpdirname:
             with CaptureLogger(logger) as cap_logger:
                 DiffusionPipeline.from_pretrained(
-                    model_id, not_used=True, cache_dir=tmpdirname, force_download=True, device_map="auto"
+                    model_id,
+                    not_used=True,
+                    cache_dir=tmpdirname,
+                    force_download=True,
                 )
 
         assert cap_logger.out == "Keyword arguments {'not_used': True} not recognized.\n"
@@ -383,7 +383,7 @@ def test_from_pretrained_save_pretrained(self):
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             ddpm.save_pretrained(tmpdirname)
-            new_ddpm = DDPMPipeline.from_pretrained(tmpdirname, device_map="auto")
+            new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)
             new_ddpm.to(torch_device)
 
         generator = torch.manual_seed(0)
@@ -399,11 +399,11 @@ def test_from_pretrained_hub(self):
 
         scheduler = DDPMScheduler(num_train_timesteps=10)
 
-        ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler, device_map="auto")
+        ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler)
         ddpm = ddpm.to(torch_device)
         ddpm.set_progress_bar_config(disable=None)
 
-        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, device_map="auto")
+        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler)
         ddpm_from_hub = ddpm_from_hub.to(torch_device)
         ddpm_from_hub.set_progress_bar_config(disable=None)
 
@@ -421,14 +421,12 @@ def test_from_pretrained_hub_pass_model(self):
         scheduler = DDPMScheduler(num_train_timesteps=10)
 
         # pass unet into DiffusionPipeline
-        unet = UNet2DModel.from_pretrained(model_path, device_map="auto")
-        ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(
-            model_path, unet=unet, scheduler=scheduler, device_map="auto"
-        )
+        unet = UNet2DModel.from_pretrained(model_path)
+        ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(model_path, unet=unet, scheduler=scheduler)
         ddpm_from_hub_custom_model = ddpm_from_hub_custom_model.to(torch_device)
         ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
 
-        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, device_map="auto")
+        ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler)
         ddpm_from_hub = ddpm_from_hub.to(torch_device)
         ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
 
@@ -443,7 +441,7 @@ def test_from_pretrained_hub_pass_model(self):
     def test_output_format(self):
         model_path = "google/ddpm-cifar10-32"
 
-        pipe = DDIMPipeline.from_pretrained(model_path, device_map="auto")
+        pipe = DDIMPipeline.from_pretrained(model_path)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -467,7 +465,7 @@ def test_output_format(self):
     def test_ddpm_ddim_equality(self, seed):
         model_id = "google/ddpm-cifar10-32"
 
-        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
+        unet = UNet2DModel.from_pretrained(model_id)
         ddpm_scheduler = DDPMScheduler()
         ddim_scheduler = DDIMScheduler()
 
@@ -498,7 +496,7 @@ def test_ddpm_ddim_equality(self, seed):
     def test_ddpm_ddim_equality_batched(self, seed):
         model_id = "google/ddpm-cifar10-32"
 
-        unet = UNet2DModel.from_pretrained(model_id, device_map="auto")
+        unet = UNet2DModel.from_pretrained(model_id)
         ddpm_scheduler = DDPMScheduler()
         ddim_scheduler = DDIMScheduler()
 

From 988c82227db1a41846a9aae5c83750dcfc334f66 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 3 Nov 2022 17:32:39 +0100
Subject: [PATCH 119/269] fix copies

---
 .../dummy_torch_and_accelerate_objects.py     | 75 +++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py
index a1650f62dbad..c2c423eeab26 100644
--- a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py
@@ -34,6 +34,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "accelerate"])
 
 
+class Transformer2DModel(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
 class UNet1DModel(metaclass=DummyObject):
     _backends = ["torch", "accelerate"]
 
@@ -227,6 +242,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "accelerate"])
 
 
+class RePaintPipeline(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
 class ScoreSdeVePipeline(metaclass=DummyObject):
     _backends = ["torch", "accelerate"]
 
@@ -242,6 +272,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "accelerate"])
 
 
+class VQDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
 class DDIMScheduler(metaclass=DummyObject):
     _backends = ["torch", "accelerate"]
 
@@ -347,6 +392,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "accelerate"])
 
 
+class RePaintScheduler(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
 class SchedulerMixin(metaclass=DummyObject):
     _backends = ["torch", "accelerate"]
 
@@ -377,6 +437,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "accelerate"])
 
 
+class VQDiffusionScheduler(metaclass=DummyObject):
+    _backends = ["torch", "accelerate"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "accelerate"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "accelerate"])
+
+
 class EMAModel(metaclass=DummyObject):
     _backends = ["torch", "accelerate"]
 

From 42bb459457d77d6185f74cbc32f2a08b08876af5 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 3 Nov 2022 18:11:18 +0100
Subject: [PATCH 120/269] [Low cpu memory] Correct naming and improve default
 usage (#1122)

* correct naming

* finish

* Apply suggestions from code review

* Apply suggestions from code review

Co-authored-by: Suraj Patil <surajp815@gmail.com>

Co-authored-by: Suraj Patil <surajp815@gmail.com>
---
 src/diffusers/modeling_utils.py               | 34 ++++++++++----
 src/diffusers/pipeline_utils.py               | 44 +++++++++++++++----
 tests/models/test_models_unet_2d.py           |  8 ++--
 .../stable_diffusion/test_stable_diffusion.py | 12 ++---
 4 files changed, 71 insertions(+), 27 deletions(-)

diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py
index f4697636719e..9e05672bf163 100644
--- a/src/diffusers/modeling_utils.py
+++ b/src/diffusers/modeling_utils.py
@@ -35,6 +35,12 @@
 logger = logging.get_logger(__name__)
 
 
+if is_torch_version(">=", "1.9.0"):
+    _LOW_CPU_MEM_USAGE_DEFAULT = True
+else:
+    _LOW_CPU_MEM_USAGE_DEFAULT = False
+
+
 def get_parameter_device(parameter: torch.nn.Module):
     try:
         return next(parameter.parameters()).device
@@ -278,11 +284,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
                 more information about each option see [designing a device
                 map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
-            fast_load (`bool`, *optional*, defaults to `True`):
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
                 Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
                 also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
                 model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
-                this argument will be ignored and the model will be loaded normally.
+                setting this argument to `True` will raise an error.
 
         <Tip>
 
@@ -311,16 +317,26 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         torch_dtype = kwargs.pop("torch_dtype", None)
         subfolder = kwargs.pop("subfolder", None)
         device_map = kwargs.pop("device_map", None)
-        fast_load = kwargs.pop("fast_load", True)
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
 
         # Check if we can handle device_map and dispatching the weights
         if device_map is not None and not is_torch_version(">=", "1.9.0"):
-            raise NotImplementedError("Loading and dispatching requires torch >= 1.9.0")
+            raise NotImplementedError(
+                "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `device_map=None`."
+            )
 
-        # Fast init is only possible if torch version is >= 1.9.0
-        _INIT_EMPTY_WEIGHTS = fast_load or device_map is not None
-        if _INIT_EMPTY_WEIGHTS and not is_torch_version(">=", "1.9.0"):
-            logger.warn("Loading with `fast_load` requires torch >= 1.9.0. Falling back to normal loading.")
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+
+        if low_cpu_mem_usage is False and device_map is not None:
+            raise ValueError(
+                f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
+                " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
+            )
 
         user_agent = {
             "diffusers": __version__,
@@ -403,7 +419,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
             # restore default dtype
 
-        if _INIT_EMPTY_WEIGHTS:
+        if low_cpu_mem_usage:
             # Instantiate model with empty weights
             with accelerate.init_empty_weights():
                 model, unused_kwargs = cls.from_config(
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 5c248ec1a956..36c2d5b888ef 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -25,6 +25,7 @@
 
 import diffusers
 import PIL
+from accelerate.utils.versions import is_torch_version
 from huggingface_hub import snapshot_download
 from packaging import version
 from PIL import Image
@@ -33,6 +34,7 @@
 from .configuration_utils import ConfigMixin
 from .dynamic_modules_utils import get_class_from_dynamic_module
 from .hub_utils import http_user_agent
+from .modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
 from .schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from .utils import (
     CONFIG_NAME,
@@ -328,6 +330,19 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 Mirror source to accelerate downloads in China. If you are from China and have an accessibility
                 problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
                 Please refer to the mirror site for more information. specify the folder name here.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn't need to be refined to each
+                parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
+                same device.
+
+                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
+                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
+                model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
+                setting this argument to `True` will raise an error.
 
             kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the
@@ -380,7 +395,25 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         provider = kwargs.pop("provider", None)
         sess_options = kwargs.pop("sess_options", None)
         device_map = kwargs.pop("device_map", None)
-        fast_load = kwargs.pop("fast_load", True)
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
+
+        if device_map is not None and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `device_map=None`."
+            )
+
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+
+        if low_cpu_mem_usage is False and device_map is not None:
+            raise ValueError(
+                f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and"
+                " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
+            )
 
         # 1. Download the checkpoints and configs
         # use snapshot download here to get it working from from_pretrained
@@ -573,17 +606,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     and version.parse(version.parse(transformers.__version__).base_version) >= version.parse("4.20.0")
                 )
 
-                if is_diffusers_model:
-                    loading_kwargs["fast_load"] = fast_load
-
                 # When loading a transformers model, if the device_map is None, the weights will be initialized as opposed to diffusers.
-                # To make default loading faster we set the `low_cpu_mem_usage=fast_load` flag which is `True` by default.
+                # To make default loading faster we set the `low_cpu_mem_usage=low_cpu_mem_usage` flag which is `True` by default.
                 # This makes sure that the weights won't be initialized which significantly speeds up loading.
-                if is_transformers_model and device_map is None:
-                    loading_kwargs["low_cpu_mem_usage"] = fast_load
-
                 if is_diffusers_model or is_transformers_model:
                     loading_kwargs["device_map"] = device_map
+                    loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
 
                 # check if the module is in a subdirectory
                 if os.path.isdir(os.path.join(cached_folder, name)):
diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
index feee72457769..71ddf1a13414 100644
--- a/tests/models/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -133,7 +133,7 @@ def test_from_pretrained_accelerate(self):
 
     @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU")
     def test_from_pretrained_accelerate_wont_change_results(self):
-        # by defautl model loading will use accelerate as `fast_load=True`
+        # by defautl model loading will use accelerate as `low_cpu_mem_usage=True`
         model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
         model_accelerate.to(torch_device)
         model_accelerate.eval()
@@ -156,7 +156,7 @@ def test_from_pretrained_accelerate_wont_change_results(self):
         gc.collect()
 
         model_normal_load, _ = UNet2DModel.from_pretrained(
-            "fusing/unet-ldm-dummy-update", output_loading_info=True, fast_init=False
+            "fusing/unet-ldm-dummy-update", output_loading_info=True, low_cpu_mem_usage=False
         )
         model_normal_load.to(torch_device)
         model_normal_load.eval()
@@ -170,7 +170,7 @@ def test_memory_footprint_gets_reduced(self):
         gc.collect()
 
         tracemalloc.start()
-        # by defautl model loading will use accelerate as `fast_load=True`
+        # by defautl model loading will use accelerate as `low_cpu_mem_usage=True`
         model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True)
         model_accelerate.to(torch_device)
         model_accelerate.eval()
@@ -181,7 +181,7 @@ def test_memory_footprint_gets_reduced(self):
         gc.collect()
 
         model_normal_load, _ = UNet2DModel.from_pretrained(
-            "fusing/unet-ldm-dummy-update", output_loading_info=True, fast_init=False
+            "fusing/unet-ldm-dummy-update", output_loading_info=True, low_cpu_mem_usage=False
         )
         model_normal_load.to(torch_device)
         model_normal_load.eval()
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 0f7798735585..b01094a607ae 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -823,23 +823,23 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
         assert test_callback_fn.has_been_called
         assert number_of_steps == 51
 
-    def test_stable_diffusion_fast_load(self):
+    def test_stable_diffusion_low_cpu_mem_usage(self):
         pipeline_id = "CompVis/stable-diffusion-v1-4"
 
         start_time = time.time()
-        pipeline_fast_load = StableDiffusionPipeline.from_pretrained(
+        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(
             pipeline_id, revision="fp16", torch_dtype=torch.float16
         )
-        pipeline_fast_load.to(torch_device)
-        fast_load_time = time.time() - start_time
+        pipeline_low_cpu_mem_usage.to(torch_device)
+        low_cpu_mem_usage_time = time.time() - start_time
 
         start_time = time.time()
         _ = StableDiffusionPipeline.from_pretrained(
-            pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, fast_load=False
+            pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, low_cpu_mem_usage=False
         )
         normal_load_time = time.time() - start_time
 
-        assert 2 * fast_load_time < normal_load_time
+        assert 2 * low_cpu_mem_usage_time < normal_load_time
 
     @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
     def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):

From 7b030a7d68888ab08cbec3d7a9c371f1568c219b Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Thu, 3 Nov 2022 18:13:18 +0100
Subject: [PATCH 121/269] handle device for randn in euler step (#1124)

* handle device for randn in euler step

* convert device to str
---
 .../schedulers/scheduling_euler_ancestral_discrete.py | 11 ++++++++++-
 src/diffusers/schedulers/scheduling_euler_discrete.py | 11 ++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
index 3fe52a498044..134b45a73b08 100644
--- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -217,7 +217,16 @@ def step(
         prev_sample = sample + derivative * dt
 
         device = model_output.device if torch.is_tensor(model_output) else "cpu"
-        noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device)
+        if str(device) == "mps":
+            # randn does not work reproducibly on mps
+            noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator).to(
+                device
+            )
+        else:
+            noise = torch.randn(model_output.shape, dtype=model_output.dtype, device=device, generator=generator).to(
+                device
+            )
+
         prev_sample = prev_sample + noise * sigma_up
 
         if not return_dict:
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index 93aeb8cc3865..6425072ac3c9 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -214,7 +214,16 @@ def step(
         gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
 
         device = model_output.device if torch.is_tensor(model_output) else "cpu"
-        noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device)
+        if str(device) == "mps":
+            # randn does not work reproducibly on mps
+            noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator).to(
+                device
+            )
+        else:
+            noise = torch.randn(model_output.shape, dtype=model_output.dtype, device=device, generator=generator).to(
+                device
+            )
+
         eps = noise * s_noise
         sigma_hat = sigma * (gamma + 1)
 

From 118c5be94a2b8eb90fa41a2ceb59b3a8de9e0218 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Thu, 3 Nov 2022 18:17:23 +0100
Subject: [PATCH 122/269] Docs: Do not require PyTorch nightlies (#1123)

Do not require PyTorch nightlies.
---
 docs/source/optimization/mps.mdx | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/docs/source/optimization/mps.mdx b/docs/source/optimization/mps.mdx
index 4eeabc658737..8a2d5ad763a2 100644
--- a/docs/source/optimization/mps.mdx
+++ b/docs/source/optimization/mps.mdx
@@ -19,11 +19,8 @@ specific language governing permissions and limitations under the License.
 - Mac computer with Apple silicon (M1/M2) hardware.
 - macOS 12.6 or later (13.0 or later recommended).
 - arm64 version of Python.
-- PyTorch 1.13.0 RC (Release Candidate). You can install it with `pip` using:
+- PyTorch 1.13. You can install it with `pip` or `conda` using the instructions in https://pytorch.org/get-started/locally/.
 
-```
-pip3 install --pre torch --extra-index-url https://download.pytorch.org/whl/test/cpu
-```
 
 ## Inference Pipeline
 
@@ -63,4 +60,4 @@ pipeline.enable_attention_slicing()
 ## Known Issues
 
 - As mentioned above, we are investigating a strange [first-time inference issue](https://github.com/huggingface/diffusers/issues/372).
-- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this is related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039). For now, we recommend to iterate instead of batching.
+- Generating multiple prompts in a batch [crashes or doesn't work reliably](https://github.com/huggingface/diffusers/issues/363). We believe this is related to the [`mps` backend in PyTorch](https://github.com/pytorch/pytorch/issues/84039). This is being resolved, but for now we recommend to iterate instead of batching.

From 1578679ff4a4ff8157214081438aa7d78f13b4fc Mon Sep 17 00:00:00 2001
From: anton-l <anton@huggingface.co>
Date: Thu, 3 Nov 2022 18:47:20 +0100
Subject: [PATCH 123/269] Release: v0.7.0

---
 setup.py                  | 2 +-
 src/diffusers/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index becd4cbb87a8..16b64fe81ac3 100644
--- a/setup.py
+++ b/setup.py
@@ -210,7 +210,7 @@ def run(self):
 
 setup(
     name="diffusers",
-    version="0.7.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.7.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="Diffusers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 61ac2425db43..3d268128a7df 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -10,7 +10,7 @@
 )
 
 
-__version__ = "0.7.0.dev0"
+__version__ = "0.7.0"
 
 from .configuration_utils import ConfigMixin
 from .onnx_utils import OnnxRuntimeModel

From 33108bfa6b13a3b0c279871678eb07f184fd28e9 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 3 Nov 2022 17:54:48 +0000
Subject: [PATCH 124/269] Correct VQDiffusion Pipeline import

---
 ...3c81220be0a04e4543e6fd9b0f290547749cc06cfb | 324 ++++++++++++++++++
 ...20be0a04e4543e6fd9b0f290547749cc06cfb.json |   1 +
 .../bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb  | 102 ++++++
 .../refs/main                                 |   1 +
 .../pipeline.py                               |   1 +
 src/diffusers/__init__.py                     |   2 +-
 .../dummy_torch_and_accelerate_objects.py     |  15 -
 .../dummy_torch_and_transformers_objects.py   |  15 +
 8 files changed, 445 insertions(+), 16 deletions(-)
 create mode 100644 clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb
 create mode 100644 clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb.json
 create mode 100644 hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/blobs/bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb
 create mode 100644 hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/refs/main
 create mode 120000 hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/snapshots/b8fa12635e53eebebc22f95ee863e7af4fc2fb07/pipeline.py

diff --git a/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb b/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb
new file mode 100644
index 000000000000..2c86e9130fdc
--- /dev/null
+++ b/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb
@@ -0,0 +1,324 @@
+import inspect
+from typing import List, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from diffusers import AutoencoderKL, DiffusionPipeline, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
+from torchvision import transforms
+from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
+
+
+class MakeCutouts(nn.Module):
+    def __init__(self, cut_size, cut_power=1.0):
+        super().__init__()
+
+        self.cut_size = cut_size
+        self.cut_power = cut_power
+
+    def forward(self, pixel_values, num_cutouts):
+        sideY, sideX = pixel_values.shape[2:4]
+        max_size = min(sideX, sideY)
+        min_size = min(sideX, sideY, self.cut_size)
+        cutouts = []
+        for _ in range(num_cutouts):
+            size = int(torch.rand([]) ** self.cut_power * (max_size - min_size) + min_size)
+            offsetx = torch.randint(0, sideX - size + 1, ())
+            offsety = torch.randint(0, sideY - size + 1, ())
+            cutout = pixel_values[:, :, offsety : offsety + size, offsetx : offsetx + size]
+            cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size))
+        return torch.cat(cutouts)
+
+
+def spherical_dist_loss(x, y):
+    x = F.normalize(x, dim=-1)
+    y = F.normalize(y, dim=-1)
+    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)
+
+
+def set_requires_grad(model, value):
+    for param in model.parameters():
+        param.requires_grad = value
+
+
+class CLIPGuidedStableDiffusion(DiffusionPipeline):
+    """CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000
+    - https://github.com/Jack000/glid-3-xl
+    - https://github.dev/crowsonkb/k-diffusion
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        clip_model: CLIPModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler],
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            clip_model=clip_model,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+
+        self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
+        self.make_cutouts = MakeCutouts(feature_extractor.size)
+
+        set_requires_grad(self.text_encoder, False)
+        set_requires_grad(self.clip_model, False)
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        self.enable_attention_slicing(None)
+
+    def freeze_vae(self):
+        set_requires_grad(self.vae, False)
+
+    def unfreeze_vae(self):
+        set_requires_grad(self.vae, True)
+
+    def freeze_unet(self):
+        set_requires_grad(self.unet, False)
+
+    def unfreeze_unet(self):
+        set_requires_grad(self.unet, True)
+
+    @torch.enable_grad()
+    def cond_fn(
+        self,
+        latents,
+        timestep,
+        index,
+        text_embeddings,
+        noise_pred_original,
+        text_embeddings_clip,
+        clip_guidance_scale,
+        num_cutouts,
+        use_cutouts=True,
+    ):
+        latents = latents.detach().requires_grad_()
+
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            sigma = self.scheduler.sigmas[index]
+            # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
+            latent_model_input = latents / ((sigma**2 + 1) ** 0.5)
+        else:
+            latent_model_input = latents
+
+        # predict the noise residual
+        noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample
+
+        if isinstance(self.scheduler, PNDMScheduler):
+            alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+            beta_prod_t = 1 - alpha_prod_t
+            # compute predicted original sample from predicted noise also called
+            # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+            pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+            fac = torch.sqrt(beta_prod_t)
+            sample = pred_original_sample * (fac) + latents * (1 - fac)
+        elif isinstance(self.scheduler, LMSDiscreteScheduler):
+            sigma = self.scheduler.sigmas[index]
+            sample = latents - sigma * noise_pred
+        else:
+            raise ValueError(f"scheduler type {type(self.scheduler)} not supported")
+
+        sample = 1 / 0.18215 * sample
+        image = self.vae.decode(sample).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        if use_cutouts:
+            image = self.make_cutouts(image, num_cutouts)
+        else:
+            image = transforms.Resize(self.feature_extractor.size)(image)
+        image = self.normalize(image).to(latents.dtype)
+
+        image_embeddings_clip = self.clip_model.get_image_features(image)
+        image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
+
+        if use_cutouts:
+            dists = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip)
+            dists = dists.view([num_cutouts, sample.shape[0], -1])
+            loss = dists.sum(2).mean(0).sum() * clip_guidance_scale
+        else:
+            loss = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip).mean() * clip_guidance_scale
+
+        grads = -torch.autograd.grad(loss, latents)[0]
+
+        if isinstance(self.scheduler, LMSDiscreteScheduler):
+            latents = latents.detach() + grads * (sigma**2)
+            noise_pred = noise_pred_original
+        else:
+            noise_pred = noise_pred_original - torch.sqrt(beta_prod_t) * grads
+        return noise_pred, latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        clip_guidance_scale: Optional[float] = 100,
+        clip_prompt: Optional[Union[str, List[str]]] = None,
+        num_cutouts: Optional[int] = 4,
+        use_cutouts: Optional[bool] = True,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        # get prompt text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if clip_guidance_scale > 0:
+            if clip_prompt is not None:
+                clip_text_input = self.tokenizer(
+                    clip_prompt,
+                    padding="max_length",
+                    max_length=self.tokenizer.model_max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                ).input_ids.to(self.device)
+            else:
+                clip_text_input = text_input.input_ids.to(self.device)
+            text_embeddings_clip = self.clip_model.get_text_features(clip_text_input)
+            text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
+            # duplicate text embeddings clip for each generation per prompt
+            text_embeddings_clip = text_embeddings_clip.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            max_length = text_input.input_ids.shape[-1]
+            uncond_input = self.tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt")
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            # duplicate unconditional embeddings for each generation per prompt
+            uncond_embeddings = uncond_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
+        extra_set_kwargs = {}
+        if accepts_offset:
+            extra_set_kwargs["offset"] = 1
+
+        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform classifier free guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # perform clip guidance
+            if clip_guidance_scale > 0:
+                text_embeddings_for_guidance = (
+                    text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings
+                )
+                noise_pred, latents = self.cond_fn(
+                    latents,
+                    t,
+                    i,
+                    text_embeddings_for_guidance,
+                    noise_pred,
+                    text_embeddings_clip,
+                    clip_guidance_scale,
+                    num_cutouts,
+                    use_cutouts,
+                )
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+
+        # scale and decode the image latents with vae
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, None)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
diff --git a/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb.json b/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb.json
new file mode 100644
index 000000000000..ebadb9070e07
--- /dev/null
+++ b/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb.json
@@ -0,0 +1 @@
+{"url": "https://raw.githubusercontent.com/huggingface/diffusers/main/examples/community/clip_guided_stable_diffusion.py", "etag": "W/\"3e4886ba6cb31f36f75ec5127cd691e562bb04d1f0ff257edbe1c182fd6a210a\""}
\ No newline at end of file
diff --git a/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/blobs/bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb b/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/blobs/bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb
new file mode 100644
index 000000000000..bbbcb9f65616
--- /dev/null
+++ b/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/blobs/bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb
@@ -0,0 +1,102 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class CustomPipeline(DiffusionPipeline):
+    r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`DDPMScheduler`], or [`DDIMScheduler`].
+    """
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        generator: Optional[torch.Generator] = None,
+        eta: float = 0.0,
+        num_inference_steps: int = 50,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            eta (`float`, *optional*, defaults to 0.0):
+                The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        # Sample gaussian noise to begin loop
+        image = torch.randn(
+            (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+            generator=generator,
+        )
+        image = image.to(self.device)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # 1. predict noise model_output
+            model_output = self.unet(image, t).sample
+
+            # 2. predict previous mean of image x_t-1 and add variance depending on eta
+            # eta corresponds to η in paper and should be between [0, 1]
+            # do x_t -> x_t-1
+            image = self.scheduler.step(model_output, t, image, eta).prev_sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image), "This is a test"
\ No newline at end of file
diff --git a/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/refs/main b/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/refs/main
new file mode 100644
index 000000000000..152c8af6817e
--- /dev/null
+++ b/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/refs/main
@@ -0,0 +1 @@
+b8fa12635e53eebebc22f95ee863e7af4fc2fb07
\ No newline at end of file
diff --git a/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/snapshots/b8fa12635e53eebebc22f95ee863e7af4fc2fb07/pipeline.py b/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/snapshots/b8fa12635e53eebebc22f95ee863e7af4fc2fb07/pipeline.py
new file mode 120000
index 000000000000..47bb96808073
--- /dev/null
+++ b/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/snapshots/b8fa12635e53eebebc22f95ee863e7af4fc2fb07/pipeline.py
@@ -0,0 +1 @@
+../../blobs/bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb
\ No newline at end of file
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 61ac2425db43..0ded715b6d08 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -46,7 +46,6 @@
         PNDMPipeline,
         RePaintPipeline,
         ScoreSdeVePipeline,
-        VQDiffusionPipeline,
     )
     from .schedulers import (
         DDIMScheduler,
@@ -77,6 +76,7 @@
         StableDiffusionInpaintPipeline,
         StableDiffusionInpaintPipelineLegacy,
         StableDiffusionPipeline,
+        VQDiffusionPipeline,
     )
 else:
     from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
diff --git a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py
index c2c423eeab26..335e3ca24d2a 100644
--- a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py
@@ -272,21 +272,6 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "accelerate"])
 
 
-class VQDiffusionPipeline(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
 class DDIMScheduler(metaclass=DummyObject):
     _backends = ["torch", "accelerate"]
 
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 615444425c44..bdc9e4f6c038 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -4,6 +4,21 @@
 from ..utils import DummyObject, requires_backends
 
 
+class VQDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class LDMTextToImagePipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 

From a24862cdaf0dedf3430b2d5cdcdabe2ebc0a7dd8 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 3 Nov 2022 17:55:14 +0000
Subject: [PATCH 125/269] Correct VQDiffusion Pipeline import

---
 ...3c81220be0a04e4543e6fd9b0f290547749cc06cfb | 324 ------------------
 ...20be0a04e4543e6fd9b0f290547749cc06cfb.json |   1 -
 .../bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb  | 102 ------
 .../refs/main                                 |   1 -
 .../pipeline.py                               |   1 -
 5 files changed, 429 deletions(-)
 delete mode 100644 clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb
 delete mode 100644 clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb.json
 delete mode 100644 hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/blobs/bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb
 delete mode 100644 hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/refs/main
 delete mode 120000 hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/snapshots/b8fa12635e53eebebc22f95ee863e7af4fc2fb07/pipeline.py

diff --git a/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb b/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb
deleted file mode 100644
index 2c86e9130fdc..000000000000
--- a/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb
+++ /dev/null
@@ -1,324 +0,0 @@
-import inspect
-from typing import List, Optional, Union
-
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-from diffusers import AutoencoderKL, DiffusionPipeline, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
-from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
-from torchvision import transforms
-from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
-
-
-class MakeCutouts(nn.Module):
-    def __init__(self, cut_size, cut_power=1.0):
-        super().__init__()
-
-        self.cut_size = cut_size
-        self.cut_power = cut_power
-
-    def forward(self, pixel_values, num_cutouts):
-        sideY, sideX = pixel_values.shape[2:4]
-        max_size = min(sideX, sideY)
-        min_size = min(sideX, sideY, self.cut_size)
-        cutouts = []
-        for _ in range(num_cutouts):
-            size = int(torch.rand([]) ** self.cut_power * (max_size - min_size) + min_size)
-            offsetx = torch.randint(0, sideX - size + 1, ())
-            offsety = torch.randint(0, sideY - size + 1, ())
-            cutout = pixel_values[:, :, offsety : offsety + size, offsetx : offsetx + size]
-            cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size))
-        return torch.cat(cutouts)
-
-
-def spherical_dist_loss(x, y):
-    x = F.normalize(x, dim=-1)
-    y = F.normalize(y, dim=-1)
-    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)
-
-
-def set_requires_grad(model, value):
-    for param in model.parameters():
-        param.requires_grad = value
-
-
-class CLIPGuidedStableDiffusion(DiffusionPipeline):
-    """CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000
-    - https://github.com/Jack000/glid-3-xl
-    - https://github.dev/crowsonkb/k-diffusion
-    """
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        clip_model: CLIPModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler],
-        feature_extractor: CLIPFeatureExtractor,
-    ):
-        super().__init__()
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            clip_model=clip_model,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            feature_extractor=feature_extractor,
-        )
-
-        self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
-        self.make_cutouts = MakeCutouts(feature_extractor.size)
-
-        set_requires_grad(self.text_encoder, False)
-        set_requires_grad(self.clip_model, False)
-
-    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
-        if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
-        self.unet.set_attention_slice(slice_size)
-
-    def disable_attention_slicing(self):
-        self.enable_attention_slicing(None)
-
-    def freeze_vae(self):
-        set_requires_grad(self.vae, False)
-
-    def unfreeze_vae(self):
-        set_requires_grad(self.vae, True)
-
-    def freeze_unet(self):
-        set_requires_grad(self.unet, False)
-
-    def unfreeze_unet(self):
-        set_requires_grad(self.unet, True)
-
-    @torch.enable_grad()
-    def cond_fn(
-        self,
-        latents,
-        timestep,
-        index,
-        text_embeddings,
-        noise_pred_original,
-        text_embeddings_clip,
-        clip_guidance_scale,
-        num_cutouts,
-        use_cutouts=True,
-    ):
-        latents = latents.detach().requires_grad_()
-
-        if isinstance(self.scheduler, LMSDiscreteScheduler):
-            sigma = self.scheduler.sigmas[index]
-            # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
-            latent_model_input = latents / ((sigma**2 + 1) ** 0.5)
-        else:
-            latent_model_input = latents
-
-        # predict the noise residual
-        noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample
-
-        if isinstance(self.scheduler, PNDMScheduler):
-            alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
-            beta_prod_t = 1 - alpha_prod_t
-            # compute predicted original sample from predicted noise also called
-            # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
-            pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
-
-            fac = torch.sqrt(beta_prod_t)
-            sample = pred_original_sample * (fac) + latents * (1 - fac)
-        elif isinstance(self.scheduler, LMSDiscreteScheduler):
-            sigma = self.scheduler.sigmas[index]
-            sample = latents - sigma * noise_pred
-        else:
-            raise ValueError(f"scheduler type {type(self.scheduler)} not supported")
-
-        sample = 1 / 0.18215 * sample
-        image = self.vae.decode(sample).sample
-        image = (image / 2 + 0.5).clamp(0, 1)
-
-        if use_cutouts:
-            image = self.make_cutouts(image, num_cutouts)
-        else:
-            image = transforms.Resize(self.feature_extractor.size)(image)
-        image = self.normalize(image).to(latents.dtype)
-
-        image_embeddings_clip = self.clip_model.get_image_features(image)
-        image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
-
-        if use_cutouts:
-            dists = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip)
-            dists = dists.view([num_cutouts, sample.shape[0], -1])
-            loss = dists.sum(2).mean(0).sum() * clip_guidance_scale
-        else:
-            loss = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip).mean() * clip_guidance_scale
-
-        grads = -torch.autograd.grad(loss, latents)[0]
-
-        if isinstance(self.scheduler, LMSDiscreteScheduler):
-            latents = latents.detach() + grads * (sigma**2)
-            noise_pred = noise_pred_original
-        else:
-            noise_pred = noise_pred_original - torch.sqrt(beta_prod_t) * grads
-        return noise_pred, latents
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        height: Optional[int] = 512,
-        width: Optional[int] = 512,
-        num_inference_steps: Optional[int] = 50,
-        guidance_scale: Optional[float] = 7.5,
-        num_images_per_prompt: Optional[int] = 1,
-        clip_guidance_scale: Optional[float] = 100,
-        clip_prompt: Optional[Union[str, List[str]]] = None,
-        num_cutouts: Optional[int] = 4,
-        use_cutouts: Optional[bool] = True,
-        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-    ):
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        # get prompt text embeddings
-        text_input = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]
-        # duplicate text embeddings for each generation per prompt
-        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
-
-        if clip_guidance_scale > 0:
-            if clip_prompt is not None:
-                clip_text_input = self.tokenizer(
-                    clip_prompt,
-                    padding="max_length",
-                    max_length=self.tokenizer.model_max_length,
-                    truncation=True,
-                    return_tensors="pt",
-                ).input_ids.to(self.device)
-            else:
-                clip_text_input = text_input.input_ids.to(self.device)
-            text_embeddings_clip = self.clip_model.get_text_features(clip_text_input)
-            text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
-            # duplicate text embeddings clip for each generation per prompt
-            text_embeddings_clip = text_embeddings_clip.repeat_interleave(num_images_per_prompt, dim=0)
-
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            max_length = text_input.input_ids.shape[-1]
-            uncond_input = self.tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt")
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
-            # duplicate unconditional embeddings for each generation per prompt
-            uncond_embeddings = uncond_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        # get the initial random noise unless the user supplied it
-
-        # Unlike in other pipelines, latents need to be generated in the target device
-        # for 1-to-1 results reproducibility with the CompVis implementation.
-        # However this currently doesn't work in `mps`.
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
-        latents_dtype = text_embeddings.dtype
-        if latents is None:
-            if self.device.type == "mps":
-                # randn does not work reproducibly on mps
-                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
-                    self.device
-                )
-            else:
-                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
-        else:
-            if latents.shape != latents_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-            latents = latents.to(self.device)
-
-        # set timesteps
-        accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys())
-        extra_set_kwargs = {}
-        if accepts_offset:
-            extra_set_kwargs["offset"] = 1
-
-        self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
-
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps_tensor = self.scheduler.timesteps.to(self.device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-
-        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
-            # expand the latents if we are doing classifier free guidance
-            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-            # predict the noise residual
-            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
-
-            # perform classifier free guidance
-            if do_classifier_free_guidance:
-                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-            # perform clip guidance
-            if clip_guidance_scale > 0:
-                text_embeddings_for_guidance = (
-                    text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings
-                )
-                noise_pred, latents = self.cond_fn(
-                    latents,
-                    t,
-                    i,
-                    text_embeddings_for_guidance,
-                    noise_pred,
-                    text_embeddings_clip,
-                    clip_guidance_scale,
-                    num_cutouts,
-                    use_cutouts,
-                )
-
-            # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
-
-        # scale and decode the image latents with vae
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-
-        image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).numpy()
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image, None)
-
-        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None)
diff --git a/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb.json b/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb.json
deleted file mode 100644
index ebadb9070e07..000000000000
--- a/clip_guided_stable_diffusion/72392adcdf265e793b0dc13d166393a9d1367724bb03f6faca8cfb1c91c30827.8d4a13da440f0a37b6d42d3c81220be0a04e4543e6fd9b0f290547749cc06cfb.json
+++ /dev/null
@@ -1 +0,0 @@
-{"url": "https://raw.githubusercontent.com/huggingface/diffusers/main/examples/community/clip_guided_stable_diffusion.py", "etag": "W/\"3e4886ba6cb31f36f75ec5127cd691e562bb04d1f0ff257edbe1c182fd6a210a\""}
\ No newline at end of file
diff --git a/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/blobs/bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb b/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/blobs/bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb
deleted file mode 100644
index bbbcb9f65616..000000000000
--- a/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/blobs/bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-
-# limitations under the License.
-
-
-from typing import Optional, Tuple, Union
-
-import torch
-
-from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-
-
-class CustomPipeline(DiffusionPipeline):
-    r"""
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Parameters:
-        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
-            [`DDPMScheduler`], or [`DDIMScheduler`].
-    """
-
-    def __init__(self, unet, scheduler):
-        super().__init__()
-        self.register_modules(unet=unet, scheduler=scheduler)
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        batch_size: int = 1,
-        generator: Optional[torch.Generator] = None,
-        eta: float = 0.0,
-        num_inference_steps: int = 50,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        **kwargs,
-    ) -> Union[ImagePipelineOutput, Tuple]:
-        r"""
-        Args:
-            batch_size (`int`, *optional*, defaults to 1):
-                The number of images to generate.
-            generator (`torch.Generator`, *optional*):
-                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
-                deterministic.
-            eta (`float`, *optional*, defaults to 0.0):
-                The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
-
-        Returns:
-            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
-            generated images.
-        """
-
-        # Sample gaussian noise to begin loop
-        image = torch.randn(
-            (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
-            generator=generator,
-        )
-        image = image.to(self.device)
-
-        # set step values
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        for t in self.progress_bar(self.scheduler.timesteps):
-            # 1. predict noise model_output
-            model_output = self.unet(image, t).sample
-
-            # 2. predict previous mean of image x_t-1 and add variance depending on eta
-            # eta corresponds to η in paper and should be between [0, 1]
-            # do x_t -> x_t-1
-            image = self.scheduler.step(model_output, t, image, eta).prev_sample
-
-        image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).numpy()
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
-
-        if not return_dict:
-            return (image,)
-
-        return ImagePipelineOutput(images=image), "This is a test"
\ No newline at end of file
diff --git a/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/refs/main b/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/refs/main
deleted file mode 100644
index 152c8af6817e..000000000000
--- a/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/refs/main
+++ /dev/null
@@ -1 +0,0 @@
-b8fa12635e53eebebc22f95ee863e7af4fc2fb07
\ No newline at end of file
diff --git a/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/snapshots/b8fa12635e53eebebc22f95ee863e7af4fc2fb07/pipeline.py b/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/snapshots/b8fa12635e53eebebc22f95ee863e7af4fc2fb07/pipeline.py
deleted file mode 120000
index 47bb96808073..000000000000
--- a/hf-internal-testing/diffusers-dummy-pipeline/models--hf-internal-testing--diffusers-dummy-pipeline/snapshots/b8fa12635e53eebebc22f95ee863e7af4fc2fb07/pipeline.py
+++ /dev/null
@@ -1 +0,0 @@
-../../blobs/bbbcb9f65616524d6199fa3bc16dc0500fb2cbbb
\ No newline at end of file

From bde4880c9cceada20b387d3110061c65249dabcc Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 3 Nov 2022 17:57:51 +0000
Subject: [PATCH 126/269] make style

---
 .../utils/dummy_torch_and_transformers_objects.py    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index bdc9e4f6c038..ea85a8f57e8f 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -4,7 +4,7 @@
 from ..utils import DummyObject, requires_backends
 
 
-class VQDiffusionPipeline(metaclass=DummyObject):
+class LDMTextToImagePipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -19,7 +19,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class LDMTextToImagePipeline(metaclass=DummyObject):
+class StableDiffusionImg2ImgPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -34,7 +34,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class StableDiffusionImg2ImgPipeline(metaclass=DummyObject):
+class StableDiffusionInpaintPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -49,7 +49,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class StableDiffusionInpaintPipeline(metaclass=DummyObject):
+class StableDiffusionInpaintPipelineLegacy(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -64,7 +64,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class StableDiffusionInpaintPipelineLegacy(metaclass=DummyObject):
+class StableDiffusionPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -79,7 +79,7 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class StableDiffusionPipeline(metaclass=DummyObject):
+class VQDiffusionPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):

From c62b3a2e7e9cb1665db1062d96cc13118350ad8d Mon Sep 17 00:00:00 2001
From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com>
Date: Fri, 4 Nov 2022 19:49:57 +0700
Subject: [PATCH 127/269] [Flax] Fix sample batch size DreamBooth  (#1129)

fix sample batch size
---
 examples/dreambooth/train_dreambooth_flax.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py
index d2652606b551..84493b1d9484 100644
--- a/examples/dreambooth/train_dreambooth_flax.py
+++ b/examples/dreambooth/train_dreambooth_flax.py
@@ -361,7 +361,8 @@ def main():
             logger.info(f"Number of class images to sample: {num_new_images}.")
 
             sample_dataset = PromptDataset(args.class_prompt, num_new_images)
-            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
+            total_sample_batch_size = args.sample_batch_size * jax.local_device_count()
+            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=total_sample_batch_size)
 
             for example in tqdm(
                 sample_dataloader, desc="Generating class images", disable=not jax.process_index() == 0

From 1d0f3c211ebb13897859bd457c218c64b5e8a805 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 4 Nov 2022 14:58:52 +0100
Subject: [PATCH 128/269] Move accelerate to a soft-dependency (#1134)

* finish

* finish

* Update src/diffusers/modeling_utils.py

* Update src/diffusers/pipeline_utils.py

Co-authored-by: Anton Lozhkov <anton@huggingface.co>

* more fixes

* fix

Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 src/diffusers/__init__.py                     |   8 -
 src/diffusers/modeling_utils.py               |  34 +-
 src/diffusers/pipeline_utils.py               |  12 +-
 src/diffusers/utils/__init__.py               |   1 +
 src/diffusers/utils/dummy_pt_objects.py       |  15 -
 .../dummy_torch_and_accelerate_objects.py     | 452 ------------------
 src/diffusers/utils/import_utils.py           |  38 ++
 tests/repo_utils/test_check_dummies.py        |   4 +-
 8 files changed, 82 insertions(+), 482 deletions(-)
 delete mode 100644 src/diffusers/utils/dummy_torch_and_accelerate_objects.py

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 22b6589973a0..e4a69641d580 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -1,5 +1,4 @@
 from .utils import (
-    is_accelerate_available,
     is_flax_available,
     is_inflect_available,
     is_onnx_available,
@@ -17,13 +16,6 @@
 from .utils import logging
 
 
-# This will create an extra dummy file "dummy_torch_and_accelerate_objects.py"
-# TODO: (patil-suraj, anton-l) maybe import everything under is_torch_and_accelerate_available
-if is_torch_available() and not is_accelerate_available():
-    error_msg = "Please install the `accelerate` library to use Diffusers with PyTorch. You can do so by running `pip install diffusers[torch]`. Or if torch is already installed, you can run `pip install accelerate`."  # noqa: E501
-    raise ImportError(error_msg)
-
-
 if is_torch_available():
     from .modeling_utils import ModelMixin
     from .models import AutoencoderKL, Transformer2DModel, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel
diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py
index 9e05672bf163..1e91ccd56a15 100644
--- a/src/diffusers/modeling_utils.py
+++ b/src/diffusers/modeling_utils.py
@@ -21,15 +21,20 @@
 import torch
 from torch import Tensor, device
 
-import accelerate
-from accelerate.utils import set_module_tensor_to_device
-from accelerate.utils.versions import is_torch_version
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
 from requests import HTTPError
 
 from . import __version__
-from .utils import CONFIG_NAME, DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, WEIGHTS_NAME, logging
+from .utils import (
+    CONFIG_NAME,
+    DIFFUSERS_CACHE,
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    WEIGHTS_NAME,
+    is_accelerate_available,
+    is_torch_version,
+    logging,
+)
 
 
 logger = logging.get_logger(__name__)
@@ -41,6 +46,12 @@
     _LOW_CPU_MEM_USAGE_DEFAULT = False
 
 
+if is_accelerate_available():
+    import accelerate
+    from accelerate.utils import set_module_tensor_to_device
+    from accelerate.utils.versions import is_torch_version
+
+
 def get_parameter_device(parameter: torch.nn.Module):
     try:
         return next(parameter.parameters()).device
@@ -319,6 +330,21 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         device_map = kwargs.pop("device_map", None)
         low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
 
+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warn(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+
+        if device_map is not None and not is_accelerate_available():
+            raise NotImplementedError(
+                "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
+                " `device_map=None`. You can install accelerate with `pip install accelerate`."
+            )
+
         # Check if we can handle device_map and dispatching the weights
         if device_map is not None and not is_torch_version(">=", "1.9.0"):
             raise NotImplementedError(
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 36c2d5b888ef..97e196e72338 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -25,7 +25,6 @@
 
 import diffusers
 import PIL
-from accelerate.utils.versions import is_torch_version
 from huggingface_hub import snapshot_download
 from packaging import version
 from PIL import Image
@@ -43,6 +42,8 @@
     WEIGHTS_NAME,
     BaseOutput,
     deprecate,
+    is_accelerate_available,
+    is_torch_version,
     is_transformers_available,
     logging,
 )
@@ -397,6 +398,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         device_map = kwargs.pop("device_map", None)
         low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
 
+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warn(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+
         if device_map is not None and not is_torch_version(">=", "1.9.0"):
             raise NotImplementedError(
                 "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index 7395f4edfa26..3fa477e7dce8 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -31,6 +31,7 @@
     is_scipy_available,
     is_tf_available,
     is_torch_available,
+    is_torch_version,
     is_transformers_available,
     is_unidecode_available,
     requires_backends,
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 833f2b6c5074..25aa82d6c5b2 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -272,21 +272,6 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
-class VQDiffusionPipeline(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
 class DDIMScheduler(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py b/src/diffusers/utils/dummy_torch_and_accelerate_objects.py
deleted file mode 100644
index 335e3ca24d2a..000000000000
--- a/src/diffusers/utils/dummy_torch_and_accelerate_objects.py
+++ /dev/null
@@ -1,452 +0,0 @@
-# This file is autogenerated by the command `make fix-copies`, do not edit.
-# flake8: noqa
-
-from ..utils import DummyObject, requires_backends
-
-
-class ModelMixin(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class AutoencoderKL(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class Transformer2DModel(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class UNet1DModel(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class UNet2DConditionModel(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class UNet2DModel(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class VQModel(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-def get_constant_schedule(*args, **kwargs):
-    requires_backends(get_constant_schedule, ["torch", "accelerate"])
-
-
-def get_constant_schedule_with_warmup(*args, **kwargs):
-    requires_backends(get_constant_schedule_with_warmup, ["torch", "accelerate"])
-
-
-def get_cosine_schedule_with_warmup(*args, **kwargs):
-    requires_backends(get_cosine_schedule_with_warmup, ["torch", "accelerate"])
-
-
-def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs):
-    requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["torch", "accelerate"])
-
-
-def get_linear_schedule_with_warmup(*args, **kwargs):
-    requires_backends(get_linear_schedule_with_warmup, ["torch", "accelerate"])
-
-
-def get_polynomial_decay_schedule_with_warmup(*args, **kwargs):
-    requires_backends(get_polynomial_decay_schedule_with_warmup, ["torch", "accelerate"])
-
-
-def get_scheduler(*args, **kwargs):
-    requires_backends(get_scheduler, ["torch", "accelerate"])
-
-
-class DiffusionPipeline(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class DanceDiffusionPipeline(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class DDIMPipeline(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class DDPMPipeline(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class KarrasVePipeline(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class LDMPipeline(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class PNDMPipeline(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class RePaintPipeline(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class ScoreSdeVePipeline(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class DDIMScheduler(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class DDPMScheduler(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class EulerAncestralDiscreteScheduler(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class EulerDiscreteScheduler(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class IPNDMScheduler(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class KarrasVeScheduler(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class PNDMScheduler(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class RePaintScheduler(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class SchedulerMixin(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class ScoreSdeVeScheduler(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class VQDiffusionScheduler(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-
-class EMAModel(metaclass=DummyObject):
-    _backends = ["torch", "accelerate"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "accelerate"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "accelerate"])
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
index 4ea02dcc94da..005cbb6170f4 100644
--- a/src/diffusers/utils/import_utils.py
+++ b/src/diffusers/utils/import_utils.py
@@ -15,11 +15,14 @@
 Import utilities: Utilities related to imports and our lazy inits.
 """
 import importlib.util
+import operator as op
 import os
 import sys
 from collections import OrderedDict
+from typing import Union
 
 from packaging import version
+from packaging.version import Version, parse
 
 from . import logging
 
@@ -40,6 +43,8 @@
 USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
 USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper()
 
+STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
+
 _torch_version = "N/A"
 if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
     _torch_available = importlib.util.find_spec("torch") is not None
@@ -309,3 +314,36 @@ def __getattr__(cls, key):
         if key.startswith("_"):
             return super().__getattr__(cls, key)
         requires_backends(cls, cls._backends)
+
+
+# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319
+def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
+    """
+    Args:
+    Compares a library version to some requirement using a given operation.
+        library_or_version (`str` or `packaging.version.Version`):
+            A library name or a version to check.
+        operation (`str`):
+            A string representation of an operator, such as `">"` or `"<="`.
+        requirement_version (`str`):
+            The version to compare the library version against
+    """
+    if operation not in STR_OPERATION_TO_FUNC.keys():
+        raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}")
+    operation = STR_OPERATION_TO_FUNC[operation]
+    if isinstance(library_or_version, str):
+        library_or_version = parse(importlib_metadata.version(library_or_version))
+    return operation(library_or_version, parse(requirement_version))
+
+
+# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L338
+def is_torch_version(operation: str, version: str):
+    """
+    Args:
+    Compares the current PyTorch version to a given reference with an operation.
+        operation (`str`):
+            A string representation of an operator, such as `">"` or `"<="`
+        version (`str`):
+            A string version of PyTorch
+    """
+    return compare_versions(parse(_torch_version), operation, version)
diff --git a/tests/repo_utils/test_check_dummies.py b/tests/repo_utils/test_check_dummies.py
index 0331b5e8c2cc..d8fa9ce10547 100644
--- a/tests/repo_utils/test_check_dummies.py
+++ b/tests/repo_utils/test_check_dummies.py
@@ -52,13 +52,13 @@ def test_find_backend(self):
     def test_read_init(self):
         objects = read_init()
         # We don't assert on the exact list of keys to allow for smooth grow of backend-specific objects
-        self.assertIn("torch_and_accelerate", objects)
+        self.assertIn("torch", objects)
         self.assertIn("torch_and_transformers", objects)
         self.assertIn("flax_and_transformers", objects)
         self.assertIn("torch_and_transformers_and_onnx", objects)
 
         # Likewise, we can't assert on the exact content of a key
-        self.assertIn("UNet2DModel", objects["torch_and_accelerate"])
+        self.assertIn("UNet2DModel", objects["torch"])
         self.assertIn("FlaxUNet2DConditionModel", objects["flax"])
         self.assertIn("StableDiffusionPipeline", objects["torch_and_transformers"])
         self.assertIn("FlaxStableDiffusionPipeline", objects["flax_and_transformers"])

From af7b1c3bf233346e9bf100d0dc6fc5f752bc5cca Mon Sep 17 00:00:00 2001
From: webbigdata-jp <87654083+webbigdata-jp@users.noreply.github.com>
Date: Sat, 5 Nov 2022 00:45:58 +0900
Subject: [PATCH 129/269] fix 404 link in example/README.mb (#1136)

fix 404 link in README.mb
---
 examples/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 56b1b81b9060..29872a7a164e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -38,9 +38,9 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
 
 | Task | 🤗 Accelerate | 🤗 Datasets | Colab
 |---|---|:---:|:---:|
-| [**Unconditional Image Generation**](./unconditional_training) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [**Text-to-Image fine-tuning**](./text2image) | ✅ | ✅ | 
-| [**Textual Inversion**](./text_inversion) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
+| [**Unconditional Image Generation**](./unconditional_image_generation) | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [**Text-to-Image fine-tuning**](./text_to_image) | ✅ | ✅ | 
+| [**Textual Inversion**](./textual_inversion) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
 | [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
 
 

From 2c108693cc20bf5f1a134d09a8cea1c49122d56e Mon Sep 17 00:00:00 2001
From: Lewington-pitsos <lewingtonpitsos@gmail.com>
Date: Sat, 5 Nov 2022 03:54:01 +1100
Subject: [PATCH 130/269] Test precision increases  (#1113)

* increase the precision of slice-based tests and make the default test case easier to single out

* increase precision of unit tests which already rely on float comparisons

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 .../stable_diffusion/test_stable_diffusion.py | 12 +++---
 .../test_stable_diffusion_img2img.py          | 38 +++++++++----------
 2 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index b01094a607ae..89fac46e74c6 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -34,7 +34,7 @@
     UNet2DModel,
     VQModel,
 )
-from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from diffusers.utils import floats_tensor, load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
@@ -753,12 +753,10 @@ def test_stable_diffusion_text2img_pipeline_fp16(self):
         # however, they should be extremely close.
         assert diff.mean() < 2e-2
 
-    def test_stable_diffusion_text2img_pipeline(self):
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/text2img/astronaut_riding_a_horse.png"
+    def test_stable_diffusion_text2img_pipeline_default(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/lewington/expected-images/resolve/main/astronaut_riding_a_horse.npy"
         )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         model_id = "CompVis/stable-diffusion-v1-4"
         pipe = StableDiffusionPipeline.from_pretrained(model_id, safety_checker=None)
@@ -773,7 +771,7 @@ def test_stable_diffusion_text2img_pipeline(self):
         image = output.images[0]
 
         assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-2
+        assert np.abs(expected_image - image).max() < 1e-3
 
     def test_stable_diffusion_text2img_intermediate_state(self):
         number_of_steps = 0
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index f5a92fac876f..ca8bc191d2c7 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -29,7 +29,7 @@
     UNet2DModel,
     VQModel,
 )
-from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
@@ -156,7 +156,7 @@ def to(self, device):
 
         return extract
 
-    def test_stable_diffusion_img2img(self):
+    def test_stable_diffusion_img2img_default_case(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         unet = self.dummy_cond_unet
         scheduler = PNDMScheduler(skip_prk_steps=True)
@@ -208,8 +208,8 @@ def test_stable_diffusion_img2img(self):
 
         assert image.shape == (1, 32, 32, 3)
         expected_slice = np.array([0.4492, 0.3865, 0.4222, 0.5854, 0.5139, 0.4379, 0.4193, 0.48, 0.4218])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-3
 
     def test_stable_diffusion_img2img_negative_prompt(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
@@ -251,7 +251,7 @@ def test_stable_diffusion_img2img_negative_prompt(self):
 
         assert image.shape == (1, 32, 32, 3)
         expected_slice = np.array([0.4065, 0.3783, 0.4050, 0.5266, 0.4781, 0.4252, 0.4203, 0.4692, 0.4365])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
     def test_stable_diffusion_img2img_multiple_init_images(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
@@ -293,7 +293,7 @@ def test_stable_diffusion_img2img_multiple_init_images(self):
 
         assert image.shape == (2, 32, 32, 3)
         expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
     def test_stable_diffusion_img2img_k_lms(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
@@ -348,8 +348,8 @@ def test_stable_diffusion_img2img_k_lms(self):
 
         assert image.shape == (1, 32, 32, 3)
         expected_slice = np.array([0.4367, 0.4986, 0.4372, 0.6706, 0.5665, 0.444, 0.5864, 0.6019, 0.5203])
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-3
 
     def test_stable_diffusion_img2img_num_images_per_prompt(self):
         device = "cpu"
@@ -472,17 +472,15 @@ def tearDown(self):
         gc.collect()
         torch.cuda.empty_cache()
 
-    def test_stable_diffusion_img2img_pipeline(self):
+    def test_stable_diffusion_img2img_pipeline_default(self):
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/img2img/sketch-mountains-input.jpg"
         )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/img2img/fantasy_landscape.png"
-        )
         init_image = init_image.resize((768, 512))
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/lewington/expected-images/resolve/main/fantasy_landscape.npy"
+        )
 
         model_id = "CompVis/stable-diffusion-v1-4"
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
@@ -508,19 +506,17 @@ def test_stable_diffusion_img2img_pipeline(self):
 
         assert image.shape == (512, 768, 3)
         # img2img is flaky across GPUs even in fp32, so using MAE here
-        assert np.abs(expected_image - image).mean() < 1e-2
+        assert np.abs(expected_image - image).mean() < 1e-3
 
     def test_stable_diffusion_img2img_pipeline_k_lms(self):
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/img2img/sketch-mountains-input.jpg"
         )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/img2img/fantasy_landscape_k_lms.png"
-        )
         init_image = init_image.resize((768, 512))
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/lewington/expected-images/resolve/main/fantasy_landscape_k_lms.npy"
+        )
 
         model_id = "CompVis/stable-diffusion-v1-4"
         lms = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
@@ -548,7 +544,7 @@ def test_stable_diffusion_img2img_pipeline_k_lms(self):
 
         assert image.shape == (512, 768, 3)
         # img2img is flaky across GPUs even in fp32, so using MAE here
-        assert np.abs(expected_image - image).mean() < 1e-2
+        assert np.abs(expected_image - image).mean() < 1e-3
 
     def test_stable_diffusion_img2img_intermediate_state(self):
         number_of_steps = 0

From 5b20d3b3d7ca38c298bc8425897efbc3b4aece6f Mon Sep 17 00:00:00 2001
From: Chenguo Lin <linchenguo0622@gmail.com>
Date: Sat, 5 Nov 2022 01:05:19 +0800
Subject: [PATCH 131/269] fix the parameter naming in `self.downsamplers`
 (#1108)

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 src/diffusers/models/unet_2d_blocks.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 4132ccbd0ce0..770043f053b2 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -462,7 +462,7 @@ def __init__(
             self.downsamplers = nn.ModuleList(
                 [
                     Downsample2D(
-                        in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                     )
                 ]
             )
@@ -546,7 +546,7 @@ def __init__(
             self.downsamplers = nn.ModuleList(
                 [
                     Downsample2D(
-                        in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                     )
                 ]
             )
@@ -651,7 +651,7 @@ def __init__(
             self.downsamplers = nn.ModuleList(
                 [
                     Downsample2D(
-                        in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                     )
                 ]
             )
@@ -729,7 +729,7 @@ def __init__(
             self.downsamplers = nn.ModuleList(
                 [
                     Downsample2D(
-                        in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                     )
                 ]
             )
@@ -801,7 +801,7 @@ def __init__(
             self.downsamplers = nn.ModuleList(
                 [
                     Downsample2D(
-                        in_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
                     )
                 ]
             )
@@ -886,7 +886,7 @@ def __init__(
                 down=True,
                 kernel="fir",
             )
-            self.downsamplers = nn.ModuleList([FirDownsample2D(in_channels, out_channels=out_channels)])
+            self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
             self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
         else:
             self.resnet_down = None
@@ -966,7 +966,7 @@ def __init__(
                 down=True,
                 kernel="fir",
             )
-            self.downsamplers = nn.ModuleList([FirDownsample2D(in_channels, out_channels=out_channels)])
+            self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
             self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
         else:
             self.resnet_down = None

From a4802294630b823125a836c8d02b83aac13eab6a Mon Sep 17 00:00:00 2001
From: SkyTNT <SKYTNT@outlook.com>
Date: Sat, 5 Nov 2022 01:38:37 +0800
Subject: [PATCH 132/269] [Community Pipeline] lpw_stable_diffusion: add
 xformers_memory_efficient_attention and sequential_cpu_offload  (#1130)

lpw_stable_diffusion: xformers and cpu_offload
---
 examples/community/lpw_stable_diffusion.py    | 60 +++++++++++++++++--
 .../community/lpw_stable_diffusion_onnx.py    |  2 +-
 2 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index 74aed2fec86f..39a266dec972 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -12,7 +12,7 @@
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import deprecate, logging
+from diffusers.utils import deprecate, is_accelerate_available, logging
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 
@@ -340,13 +340,15 @@ def get_weighted_text_embeddings(
     # assign weights to the prompts and normalize in the sense of mean
     # TODO: should we normalize by chunk or in a whole (current implementation)?
     if (not skip_parsing) and (not skip_weighting):
-        previous_mean = text_embeddings.mean(axis=[-2, -1])
+        previous_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
         text_embeddings *= prompt_weights.unsqueeze(-1)
-        text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
+        current_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
+        text_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
         if uncond_prompt is not None:
-            previous_mean = uncond_embeddings.mean(axis=[-2, -1])
+            previous_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
             uncond_embeddings *= uncond_weights.unsqueeze(-1)
-            uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1)
+            current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
+            uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
 
     if uncond_prompt is not None:
         return text_embeddings, uncond_embeddings
@@ -431,6 +433,19 @@ def __init__(
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
         if safety_checker is None:
             logger.warn(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
@@ -451,6 +466,24 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
@@ -478,6 +511,23 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
+    def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = self.device
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
     @torch.no_grad()
     def __call__(
         self,
diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py
index 69b942f9ef1a..12e306a612e1 100644
--- a/examples/community/lpw_stable_diffusion_onnx.py
+++ b/examples/community/lpw_stable_diffusion_onnx.py
@@ -701,7 +701,7 @@ def __call__(
                     clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
                 )
                 images.append(image_i)
-                has_nsfw_concept.append(has_nsfw_concept_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
             image = np.concatenate(images)
         else:
             has_nsfw_concept = None

From 2fcae69f2a181960c37be8e3081b9da0f2dc3b21 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Fri, 4 Nov 2022 19:06:24 +0100
Subject: [PATCH 133/269] Bump to 0.8.0.dev0 (#1131)

* Bump to 0.8.0.dev0

* deprecate int timesteps

* style
---
 setup.py                                      |  2 +-
 src/diffusers/__init__.py                     |  2 +-
 .../scheduling_euler_ancestral_discrete.py    | 16 ++-------
 .../schedulers/scheduling_euler_discrete.py   | 16 ++-------
 .../schedulers/scheduling_lms_discrete.py     | 33 ++-----------------
 5 files changed, 9 insertions(+), 60 deletions(-)

diff --git a/setup.py b/setup.py
index 16b64fe81ac3..1bb6af4b10bf 100644
--- a/setup.py
+++ b/setup.py
@@ -210,7 +210,7 @@ def run(self):
 
 setup(
     name="diffusers",
-    version="0.7.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.8.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="Diffusers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index e4a69641d580..9c954c38165b 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -9,7 +9,7 @@
 )
 
 
-__version__ = "0.7.0"
+__version__ = "0.8.0.dev0"
 
 from .configuration_utils import ConfigMixin
 from .onnx_utils import OnnxRuntimeModel
diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
index 134b45a73b08..fe45b3d591f5 100644
--- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -19,7 +19,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput, deprecate, logging
+from ..utils import BaseOutput, logging
 from .scheduling_utils import SchedulerMixin
 
 
@@ -253,19 +253,7 @@ def add_noise(
             timesteps = timesteps.to(original_samples.device)
 
         schedule_timesteps = self.timesteps
-
-        if isinstance(timesteps, torch.IntTensor) or isinstance(timesteps, torch.LongTensor):
-            deprecate(
-                "timesteps as indices",
-                "0.8.0",
-                "Passing integer indices  (e.g. from `enumerate(timesteps)`) as timesteps to"
-                " `EulerAncestralDiscreteScheduler.add_noise()` will not be supported in future versions. Make sure to"
-                " pass values from `scheduler.timesteps` as timesteps.",
-                standard_warn=False,
-            )
-            step_indices = timesteps
-        else:
-            step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
 
         sigma = self.sigmas[step_indices].flatten()
         while len(sigma.shape) < len(original_samples.shape):
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index 6425072ac3c9..0cb31a451272 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -19,7 +19,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput, deprecate, logging
+from ..utils import BaseOutput, logging
 from .scheduling_utils import SchedulerMixin
 
 
@@ -262,19 +262,7 @@ def add_noise(
             timesteps = timesteps.to(original_samples.device)
 
         schedule_timesteps = self.timesteps
-
-        if isinstance(timesteps, torch.IntTensor) or isinstance(timesteps, torch.LongTensor):
-            deprecate(
-                "timesteps as indices",
-                "0.8.0",
-                "Passing integer indices  (e.g. from `enumerate(timesteps)`) as timesteps to"
-                " `EulerDiscreteScheduler.add_noise()` will not be supported in future versions. Make sure to"
-                " pass values from `scheduler.timesteps` as timesteps.",
-                standard_warn=False,
-            )
-            step_indices = timesteps
-        else:
-            step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
 
         sigma = self.sigmas[step_indices].flatten()
         while len(sigma.shape) < len(original_samples.shape):
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index 43e577b40936..8d633267c607 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -21,7 +21,7 @@
 from scipy import integrate
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput, deprecate
+from ..utils import BaseOutput
 from .scheduling_utils import SchedulerMixin
 
 
@@ -211,22 +211,7 @@ def step(
 
         if isinstance(timestep, torch.Tensor):
             timestep = timestep.to(self.timesteps.device)
-        if (
-            isinstance(timestep, int)
-            or isinstance(timestep, torch.IntTensor)
-            or isinstance(timestep, torch.LongTensor)
-        ):
-            deprecate(
-                "timestep as an index",
-                "0.8.0",
-                "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
-                " `LMSDiscreteScheduler.step()` will not be supported in future versions. Make sure to pass"
-                " one of the `scheduler.timesteps` as a timestep.",
-                standard_warn=False,
-            )
-            step_index = timestep
-        else:
-            step_index = (self.timesteps == timestep).nonzero().item()
+        step_index = (self.timesteps == timestep).nonzero().item()
         sigma = self.sigmas[step_index]
 
         # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
@@ -269,19 +254,7 @@ def add_noise(
             timesteps = timesteps.to(original_samples.device)
 
         schedule_timesteps = self.timesteps
-
-        if isinstance(timesteps, torch.IntTensor) or isinstance(timesteps, torch.LongTensor):
-            deprecate(
-                "timesteps as indices",
-                "0.8.0",
-                "Passing integer indices  (e.g. from `enumerate(timesteps)`) as timesteps to"
-                " `LMSDiscreteScheduler.add_noise()` will not be supported in future versions. Make sure to"
-                " pass values from `scheduler.timesteps` as timesteps.",
-                standard_warn=False,
-            )
-            step_indices = timesteps
-        else:
-            step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
 
         sigma = self.sigmas[step_indices].flatten()
         while len(sigma.shape) < len(original_samples.shape):

From 1172c9634b4a32d6e82301e3d59ce17005e13e85 Mon Sep 17 00:00:00 2001
From: Pi Esposito <piero.skywalker@gmail.com>
Date: Fri, 4 Nov 2022 15:25:28 -0300
Subject: [PATCH 134/269] add enable sequential cpu offloading to other stable
 diffusion pipelines (#1085)

* add enable sequential cpu offloading to other stable diffusion pipelines

* trigger ci

* fix styling

* interpolate before converting to device to avoid breking when cpu_offload is enabled with fp16

Co-authored-by: Pedro Gengo  <pedro.gabriel.lourenco@hotmail.com>

* style again I need to stop forgething this thing

* fix inpainting bug that could cause device misalignment

Co-authored-by: Pedro Gengo  <pedro.gabriel.lourenco@hotmail.com>

* Apply suggestions from code review

Co-authored-by: Pedro Gengo  <pedro.gabriel.lourenco@hotmail.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 .../pipeline_stable_diffusion_img2img.py      | 18 +++++++
 .../pipeline_stable_diffusion_inpaint.py      | 28 ++++++++++-
 .../stable_diffusion/test_stable_diffusion.py |  2 +-
 .../test_stable_diffusion_img2img.py          | 45 ++++++++++++++++++
 .../test_stable_diffusion_inpaint.py          | 47 ++++++++++++++++++-
 5 files changed, 136 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 8284bac85091..08b14b36be89 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -5,6 +5,7 @@
 import torch
 
 import PIL
+from diffusers.utils import is_accelerate_available
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -151,6 +152,23 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `set_attention_slice`
         self.enable_attention_slicing(None)
 
+    def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device("cuda")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
     def enable_xformers_memory_efficient_attention(self):
         r"""
         Enable memory efficient attention as implemented in xformers.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index c200892ef60a..34e8231c63ee 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -5,6 +5,7 @@
 import torch
 
 import PIL
+from diffusers.utils import is_accelerate_available
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -151,6 +152,23 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
+    def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device("cuda")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
     def enable_xformers_memory_efficient_attention(self):
         r"""
         Enable memory efficient attention as implemented in xformers.
@@ -361,11 +379,14 @@ def __call__(
 
         # prepare mask and masked_image
         mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
-        mask = mask.to(device=self.device, dtype=text_embeddings.dtype)
-        masked_image = masked_image.to(device=self.device, dtype=text_embeddings.dtype)
 
         # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
         mask = torch.nn.functional.interpolate(mask, size=(height // 8, width // 8))
+        mask = mask.to(device=self.device, dtype=text_embeddings.dtype)
+
+        masked_image = masked_image.to(device=self.device, dtype=text_embeddings.dtype)
 
         # encode the mask image into latents space so we can concatenate it to the latents
         masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
@@ -380,6 +401,9 @@ def __call__(
             torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
         )
 
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=self.device, dtype=text_embeddings.dtype)
+
         num_channels_mask = mask.shape[1]
         num_channels_masked_image = masked_image_latents.shape[1]
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 89fac46e74c6..a83299eaf9f3 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -840,7 +840,7 @@ def test_stable_diffusion_low_cpu_mem_usage(self):
         assert 2 * low_cpu_mem_usage_time < normal_load_time
 
     @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
-    def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         torch.cuda.empty_cache()
         torch.cuda.reset_max_memory_allocated()
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index ca8bc191d2c7..2d29e1b80644 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -599,3 +599,48 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
             )
         assert test_callback_fn.has_been_called
         assert number_of_steps == 38
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/fantasy_landscape_k_lms.png"
+        )
+        init_image = init_image.resize((768, 512))
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        model_id = "CompVis/stable-diffusion-v1-4"
+        lms = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+            model_id,
+            scheduler=lms,
+            safety_checker=None,
+            device_map="auto",
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        _ = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=5,
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 1.5 GB is allocated
+        assert mem_bytes < 1.5 * 10**9
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 44a7a3249a7e..e8dcb43163da 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -378,4 +378,49 @@ def test_stable_diffusion_inpaint_pipeline_pndm(self):
         image = output.images[0]
 
         assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-3
+        assert np.abs(expected_image - image).max() < 1e-2
+
+    @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/yellow_cat_sitting_on_a_park_bench_pndm.png"
+        )
+        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
+
+        model_id = "runwayml/stable-diffusion-inpainting"
+        pndm = PNDMScheduler.from_config(model_id, subfolder="scheduler")
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id, safety_checker=None, scheduler=pndm, device_map="auto"
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        _ = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            num_inference_steps=5,
+            output_type="np",
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 1.5 GB is allocated
+        assert mem_bytes < 1.5 * 10**9

From 9d8943b7e7361e8527fc662d9769707087c4bad6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Chen=20Wu=20=28=E5=90=B4=E5=B0=98=29?= <henrychenwu@cmu.edu>
Date: Fri, 4 Nov 2022 15:51:06 -0400
Subject: [PATCH 135/269] Add CycleDiffusion pipeline using Stable Diffusion
 (#888)

* Add CycleDiffusion pipeline for Stable Diffusion

* Add the option of passing noise to DDIMScheduler

Add the option of providing the noise itself to DDIMScheduler, instead of the random seed generator.

* Update README.md

* Update README.md

* Update pipeline_stable_diffusion_cycle_diffusion.py

* Update pipeline_stable_diffusion_cycle_diffusion.py

* Update pipeline_stable_diffusion_cycle_diffusion.py

* Update pipeline_stable_diffusion_cycle_diffusion.py

* Update scheduling_ddim.py

* Update import format

* Update pipeline_stable_diffusion_cycle_diffusion.py

* Update scheduling_ddim.py

* Update src/diffusers/schedulers/scheduling_ddim.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/diffusers/schedulers/scheduling_ddim.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/diffusers/schedulers/scheduling_ddim.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/diffusers/schedulers/scheduling_ddim.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/diffusers/schedulers/scheduling_ddim.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update scheduling_ddim.py

* Update scheduling_ddim.py

* Update scheduling_ddim.py

* add two tests

* Update pipeline_stable_diffusion_cycle_diffusion.py

* Update pipeline_stable_diffusion_cycle_diffusion.py

* Update README.md

* Rename pipeline name as suggested in the latest reviewer comment

* Update test_pipelines.py

* Update test_pipelines.py

* Update test_pipelines.py

* Update pipeline_stable_diffusion_cycle_diffusion.py

* Remove the generator

This generator does not control all randomness during sampling, which can be misleading.

* Update optimal hyperparameters

* Update src/diffusers/pipelines/stable_diffusion/README.md

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Update src/diffusers/pipelines/stable_diffusion/README.md

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Update src/diffusers/pipelines/stable_diffusion/README.md

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Apply suggestions from code review

* uP

* Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_cycle_diffusion.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* up

* up

* Replace assert with ValueError

* finish docs

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
---
 docs/source/_toctree.yml                      |   2 +
 docs/source/api/pipelines/cycle_diffusion.mdx |  99 ++++
 docs/source/api/pipelines/overview.mdx        |  29 +-
 docs/source/index.mdx                         |   1 +
 src/diffusers/__init__.py                     |   1 +
 src/diffusers/pipelines/__init__.py           |   1 +
 .../pipelines/stable_diffusion/README.md      |  71 +++
 .../pipelines/stable_diffusion/__init__.py    |   1 +
 .../pipeline_cycle_diffusion.py               | 527 ++++++++++++++++++
 src/diffusers/schedulers/scheduling_ddim.py   |  17 +-
 .../dummy_torch_and_transformers_objects.py   |  15 +
 .../stable_diffusion/test_cycle_diffusion.py  | 348 ++++++++++++
 12 files changed, 1097 insertions(+), 15 deletions(-)
 create mode 100644 docs/source/api/pipelines/cycle_diffusion.mdx
 create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
 create mode 100644 tests/pipelines/stable_diffusion/test_cycle_diffusion.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 70d64b80de7a..d8efb5eee3d3 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -78,6 +78,8 @@
   - sections:
     - local: api/pipelines/overview
       title: "Overview"
+    - local: api/pipelines/cycle_diffusion
+      title: "Cycle Diffusion"
     - local: api/pipelines/ddim
       title: "DDIM"
     - local: api/pipelines/ddpm
diff --git a/docs/source/api/pipelines/cycle_diffusion.mdx b/docs/source/api/pipelines/cycle_diffusion.mdx
new file mode 100644
index 000000000000..50d2a5c87eca
--- /dev/null
+++ b/docs/source/api/pipelines/cycle_diffusion.mdx
@@ -0,0 +1,99 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Cycle Diffusion
+
+## Overview
+
+Cycle Diffusion is a Text-Guided Image-to-Image Generation model proposed in [Unifying Diffusion Models' Latent Space, with Applications to CycleDiffusion and Guidance](https://arxiv.org/abs/2210.05559) by Chen Henry Wu, Fernando De la Torre.
+
+The abstract of the paper is the following:
+
+*Diffusion models have achieved unprecedented performance in generative modeling. The commonly-adopted formulation of the latent code of diffusion models is a sequence of gradually denoised samples, as opposed to the simpler (e.g., Gaussian) latent space of GANs, VAEs, and normalizing flows. This paper provides an alternative, Gaussian formulation of the latent space of various diffusion models, as well as an invertible DPM-Encoder that maps images into the latent space. While our formulation is purely based on the definition of diffusion models, we demonstrate several intriguing consequences. (1) Empirically, we observe that a common latent space emerges from two diffusion models trained independently on related domains. In light of this finding, we propose CycleDiffusion, which uses DPM-Encoder for unpaired image-to-image translation. Furthermore, applying CycleDiffusion to text-to-image diffusion models, we show that large-scale text-to-image diffusion models can be used as zero-shot image-to-image editors. (2) One can guide pre-trained diffusion models and GANs by controlling the latent codes in a unified, plug-and-play formulation based on energy-based models. Using the CLIP model and a face recognition model as guidance, we demonstrate that diffusion models have better coverage of low-density sub-populations and individuals than GANs.*
+
+*Tips*:
+- The Cycle Diffusion pipeline is fully compatible with any [Stable Diffusion](./stable_diffusion) checkpoints
+- Currently Cycle Diffusion only works with the [`DDIMScheduler`].
+
+*Example*:
+
+In the following we should how to best use the [`CycleDiffusionPipeline`]
+
+```python
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from diffusers import CycleDiffusionPipeline, DDIMScheduler
+
+# load the pipeline
+# make sure you're logged in with `huggingface-cli login`
+model_id_or_path = "CompVis/stable-diffusion-v1-4"
+scheduler = DDIMScheduler.from_config(model_id_or_path, subfolder="scheduler")
+pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
+
+# let's download an initial image
+url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+init_image.save("horse.png")
+
+# let's specify a prompt
+source_prompt = "An astronaut riding a horse"
+prompt = "An astronaut riding an elephant"
+
+# call the pipeline
+image = pipe(
+    prompt=prompt,
+    source_prompt=source_prompt,
+    init_image=init_image,
+    num_inference_steps=100,
+    eta=0.1,
+    strength=0.8,
+    guidance_scale=2,
+    source_guidance_scale=1,
+).images[0]
+
+image.save("horse_to_elephant.png")
+
+# let's try another example
+# See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion
+url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+init_image.save("black.png")
+
+source_prompt = "A black colored car"
+prompt = "A blue colored car"
+
+# call the pipeline
+torch.manual_seed(0)
+image = pipe(
+    prompt=prompt,
+    source_prompt=source_prompt,
+    init_image=init_image,
+    num_inference_steps=100,
+    eta=0.1,
+    strength=0.85,
+    guidance_scale=3,
+    source_guidance_scale=1,
+).images[0]
+
+image.save("black_to_blue.png")
+```
+
+## CycleDiffusionPipeline
+[[autodoc]] CycleDiffusionPipeline
+	- __call__
diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index 5a15473cf19c..d68961a2fc06 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -41,21 +41,24 @@ If you are looking for *official* training examples, please have a look at [exam
 The following table summarizes all officially supported pipelines, their corresponding paper, and if 
 available a colab notebook to directly try them out.
 
+
 | Pipeline | Paper | Tasks | Colab
 |---|---|:---:|:---:|
-| [ddpm](./ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
-| [ddim](./ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
-| [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
-| [pndm](./pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
-| [score_sde_ve](./score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
-| [score_sde_vp](./score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
-| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
-| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
-| [stable_diffusion](./stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
-| [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
-| [vq_diffusion](./vq_diffusion) | [**Vector Quantized Diffusion Model for Text-to-Image Synthesis**](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
-| [repaint](./repaint)                                 | [**RePaint: Inpainting using Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2201.09865)                   |           Image Inpainting            | 
+| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
+| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
+| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
+| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
+| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
+| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
+| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
+| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
+| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
+| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 
+
 
 **Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
 
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 62a3e88f173d..bae507ac11d3 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -34,6 +34,7 @@ available a colab notebook to directly try them out.
 
 | Pipeline | Paper | Tasks | Colab
 |---|---|:---:|:---:|
+| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
 | [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
 | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
 | [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 9c954c38165b..3f3f3b56a253 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -63,6 +63,7 @@
 
 if is_torch_available() and is_transformers_available():
     from .pipelines import (
+        CycleDiffusionPipeline,
         LDMTextToImagePipeline,
         StableDiffusionImg2ImgPipeline,
         StableDiffusionInpaintPipeline,
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index bb3440b2bfbc..eb0635f6ee07 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -16,6 +16,7 @@
 if is_torch_available() and is_transformers_available():
     from .latent_diffusion import LDMTextToImagePipeline
     from .stable_diffusion import (
+        CycleDiffusionPipeline,
         StableDiffusionImg2ImgPipeline,
         StableDiffusionInpaintPipeline,
         StableDiffusionInpaintPipelineLegacy,
diff --git a/src/diffusers/pipelines/stable_diffusion/README.md b/src/diffusers/pipelines/stable_diffusion/README.md
index eaa441329a5c..a76e4c668272 100644
--- a/src/diffusers/pipelines/stable_diffusion/README.md
+++ b/src/diffusers/pipelines/stable_diffusion/README.md
@@ -103,3 +103,74 @@ image = pipe(prompt).sample[0]
     
 image.save("astronaut_rides_horse.png")
 ```
+
+### CycleDiffusion using Stable Diffusion and DDIM scheduler
+
+```python
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from diffusers import CycleDiffusionPipeline, DDIMScheduler
+
+
+# load the scheduler. CycleDiffusion only supports stochastic schedulers.
+
+# load the pipeline
+# make sure you're logged in with `huggingface-cli login`
+model_id_or_path = "CompVis/stable-diffusion-v1-4"
+scheduler = DDIMScheduler.from_config(model_id_or_path, subfolder="scheduler")
+pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
+
+# let's download an initial image
+url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+init_image.save("horse.png")
+
+# let's specify a prompt
+source_prompt = "An astronaut riding a horse"
+prompt = "An astronaut riding an elephant"
+
+# call the pipeline
+image = pipe(
+    prompt=prompt,
+    source_prompt=source_prompt,
+    init_image=init_image,
+    num_inference_steps=100,
+    eta=0.1,
+    strength=0.8,
+    guidance_scale=2,
+    source_guidance_scale=1,
+).images[0]
+
+image.save("horse_to_elephant.png")
+
+# let's try another example
+# See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion
+url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((512, 512))
+init_image.save("black.png")
+
+source_prompt = "A black colored car"
+prompt = "A blue colored car"
+
+# call the pipeline
+torch.manual_seed(0)
+image = pipe(
+    prompt=prompt,
+    source_prompt=source_prompt,
+    init_image=init_image,
+    num_inference_steps=100,
+    eta=0.1,
+    strength=0.85,
+    guidance_scale=3,
+    source_guidance_scale=1,
+).images[0]
+
+image.save("black_to_blue.png")
+```
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 5a452138b7a8..6623929f8648 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -28,6 +28,7 @@ class StableDiffusionPipelineOutput(BaseOutput):
 
 
 if is_transformers_available() and is_torch_available():
+    from .pipeline_cycle_diffusion import CycleDiffusionPipeline
     from .pipeline_stable_diffusion import StableDiffusionPipeline
     from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
     from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
new file mode 100644
index 000000000000..f38e57983f15
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -0,0 +1,527 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+def posterior_sample(scheduler, latents, timestep, clean_latents, eta):
+    # 1. get previous step value (=t-1)
+    prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
+
+    if prev_timestep <= 0:
+        return clean_latents
+
+    # 2. compute alphas, betas
+    alpha_prod_t = scheduler.alphas_cumprod[timestep]
+    alpha_prod_t_prev = (
+        scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+    )
+
+    variance = scheduler._get_variance(timestep, prev_timestep)
+    std_dev_t = eta * variance ** (0.5)
+
+    # direction pointing to x_t
+    e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5)
+    dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t
+    noise = std_dev_t * torch.randn(clean_latents.shape, dtype=clean_latents.dtype, device=clean_latents.device)
+    prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise
+
+    return prev_latents
+
+
+def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta):
+    # 1. get previous step value (=t-1)
+    prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
+
+    # 2. compute alphas, betas
+    alpha_prod_t = scheduler.alphas_cumprod[timestep]
+    alpha_prod_t_prev = (
+        scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod
+    )
+
+    beta_prod_t = 1 - alpha_prod_t
+
+    # 3. compute predicted original sample from predicted noise also called
+    # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)
+
+    # 4. Clip "predicted x_0"
+    if scheduler.config.clip_sample:
+        pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+
+    # 5. compute variance: "sigma_t(η)" -> see formula (16)
+    # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+    variance = scheduler._get_variance(timestep, prev_timestep)
+    std_dev_t = eta * variance ** (0.5)
+
+    # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred
+
+    noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / (
+        variance ** (0.5) * eta
+    )
+    return noise
+
+
+class CycleDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image to image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: DDIMScheduler,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `set_attention_slice`
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        source_prompt: Union[str, List[str]],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        source_guidance_scale: Optional[float] = 1,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.1,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            source_guidance_scale (`float`, *optional*, defaults to 1):
+                Guidance scale for the source prompt. This is useful to control the amount of influence the source
+                prompt for encoding.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.1):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if batch_size != 1:
+            raise ValueError(
+                "At the moment only `batch_size=1` is supported for prompts, but you seem to have passed multiple"
+                f" prompts: {prompt}. Please make sure to pass only a single prompt."
+            )
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        if isinstance(init_image, PIL.Image.Image):
+            init_image = preprocess(init_image)
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        source_text_inputs = self.tokenizer(
+            source_prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        source_text_input_ids = source_text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        if source_text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(source_text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            source_text_input_ids = source_text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+        source_text_embeddings = self.text_encoder(source_text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+        source_text_embeddings = source_text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+
+        # get unconditional embeddings for classifier free guidance
+        uncond_tokens = [""]
+
+        max_length = text_input_ids.shape[-1]
+        uncond_input = self.tokenizer(
+            uncond_tokens,
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+        # duplicate unconditional embeddings for each generation per prompt
+        uncond_embeddings = uncond_embeddings.repeat_interleave(batch_size * num_images_per_prompt, dim=0)
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        source_uncond_tokens = [""]
+
+        max_length = source_text_input_ids.shape[-1]
+        source_uncond_input = self.tokenizer(
+            source_uncond_tokens,
+            padding="max_length",
+            max_length=max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        source_uncond_embeddings = self.text_encoder(source_uncond_input.input_ids.to(self.device))[0]
+
+        # duplicate unconditional embeddings for each generation per prompt
+        source_uncond_embeddings = source_uncond_embeddings.repeat_interleave(
+            batch_size * num_images_per_prompt, dim=0
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        source_text_embeddings = torch.cat([source_uncond_embeddings, source_text_embeddings])
+
+        # encode the init image into latents and scale the latents
+        latents_dtype = text_embeddings.dtype
+        init_image = init_image.to(device=self.device, dtype=latents_dtype)
+        init_latent_dist = self.vae.encode(init_image).latent_dist
+        init_latents = init_latent_dist.sample(generator=generator)
+        init_latents = 0.18215 * init_latents
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many init images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = len(prompt) // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
+        elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
+
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        timesteps = self.scheduler.timesteps[-init_timestep]
+        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
+
+        # add noise to latents using the timesteps
+        noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
+        clean_latents = init_latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+
+        if not (accepts_eta and (0 < eta <= 1)):
+            raise ValueError(
+                "Currently, only the DDIM scheduler is supported. Please make sure that `pipeline.scheduler` is of"
+                f" type {DDIMScheduler.__class__} and not {self.scheduler.__class__}."
+            )
+
+        extra_step_kwargs["eta"] = eta
+
+        latents = init_latents
+        source_latents = init_latents
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2)
+            source_latent_model_input = torch.cat([source_latents] * 2)
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t)
+
+            # predict the noise residual
+            concat_latent_model_input = torch.stack(
+                [
+                    source_latent_model_input[0],
+                    latent_model_input[0],
+                    source_latent_model_input[1],
+                    latent_model_input[1],
+                ],
+                dim=0,
+            )
+            concat_text_embeddings = torch.stack(
+                [
+                    source_text_embeddings[0],
+                    text_embeddings[0],
+                    source_text_embeddings[1],
+                    text_embeddings[1],
+                ],
+                dim=0,
+            )
+            concat_noise_pred = self.unet(
+                concat_latent_model_input, t, encoder_hidden_states=concat_text_embeddings
+            ).sample
+
+            # perform guidance
+            (
+                source_noise_pred_uncond,
+                noise_pred_uncond,
+                source_noise_pred_text,
+                noise_pred_text,
+            ) = concat_noise_pred.chunk(4, dim=0)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            source_noise_pred = source_noise_pred_uncond + source_guidance_scale * (
+                source_noise_pred_text - source_noise_pred_uncond
+            )
+
+            # Sample source_latents from the posterior distribution.
+            prev_source_latents = posterior_sample(
+                self.scheduler, source_latents, t, clean_latents, **extra_step_kwargs
+            )
+            # Compute noise.
+            noise = compute_noise(
+                self.scheduler, prev_source_latents, source_latents, t, source_noise_pred, **extra_step_kwargs
+            )
+            source_latents = prev_source_latents
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs
+            ).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 23648d1bc3c0..62ee9c0244ae 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -208,6 +208,7 @@ def step(
         eta: float = 0.0,
         use_clipped_model_output: bool = False,
         generator=None,
+        variance_noise: Optional[torch.FloatTensor] = None,
         return_dict: bool = True,
     ) -> Union[DDIMSchedulerOutput, Tuple]:
         """
@@ -225,6 +226,9 @@ def step(
                 `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
                 coincide with the one provided as input and `use_clipped_model_output` will have not effect.
             generator: random number generator.
+            variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we
+                can directly provide the noise for the variance itself. This is useful for methods such as
+                CycleDiffusion. (https://arxiv.org/abs/2210.05559)
             return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
 
         Returns:
@@ -284,8 +288,17 @@ def step(
         if eta > 0:
             # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072
             device = model_output.device if torch.is_tensor(model_output) else "cpu"
-            noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device)
-            variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * noise
+            if variance_noise is not None and generator is not None:
+                raise ValueError(
+                    "Cannot pass both generator and variance_noise. Please make sure that either `generator` or"
+                    " `variance_noise` stays `None`."
+                )
+
+            if variance_noise is None:
+                variance_noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(
+                    device
+                )
+            variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * variance_noise
 
             prev_sample = prev_sample + variance
 
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index ea85a8f57e8f..63e8a60f74c2 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -4,6 +4,21 @@
 from ..utils import DummyObject, requires_backends
 
 
+class CycleDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class LDMTextToImagePipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
new file mode 100644
index 000000000000..0bddd63807d6
--- /dev/null
+++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
@@ -0,0 +1,348 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler, UNet2DConditionModel, UNet2DModel, VQModel
+from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_cond_unet_inpaint(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vq_model(self):
+        torch.manual_seed(0)
+        model = VQModel(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=3,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_stable_diffusion_cycle(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            num_train_timesteps=1000,
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = CycleDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        source_prompt = "An astronaut riding a horse"
+        prompt = "An astronaut riding an elephant"
+        init_image = self.dummy_image.to(device)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            prompt=prompt,
+            source_prompt=source_prompt,
+            generator=generator,
+            num_inference_steps=2,
+            init_image=init_image,
+            eta=0.1,
+            strength=0.8,
+            guidance_scale=3,
+            source_guidance_scale=1,
+            output_type="np",
+        )
+        images = output.images
+
+        image_slice = images[0, -3:, -3:, -1]
+
+        assert images.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.4459, 0.4943, 0.4544, 0.6643, 0.5474, 0.4327, 0.5701, 0.5959, 0.5179])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_cycle_fp16(self):
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            num_train_timesteps=1000,
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = CycleDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        source_prompt = "An astronaut riding a horse"
+        prompt = "An astronaut riding an elephant"
+        init_image = self.dummy_image.to(torch_device)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = sd_pipe(
+            prompt=prompt,
+            source_prompt=source_prompt,
+            generator=generator,
+            num_inference_steps=2,
+            init_image=init_image,
+            eta=0.1,
+            strength=0.8,
+            guidance_scale=3,
+            source_guidance_scale=1,
+            output_type="np",
+        )
+        images = output.images
+
+        image_slice = images[0, -3:, -3:, -1]
+
+        assert images.shape == (1, 32, 32, 3)
+        expected_slice = np.array([0.3506, 0.4543, 0.446, 0.4575, 0.5195, 0.4155, 0.5273, 0.518, 0.4116])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+
+@slow
+@require_torch_gpu
+class CycleDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_cycle_diffusion_pipeline_fp16(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/cycle-diffusion/black_colored_car.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/blue_colored_car_fp16.npy"
+        )
+        init_image = init_image.resize((512, 512))
+
+        model_id = "CompVis/stable-diffusion-v1-4"
+        scheduler = DDIMScheduler.from_config(model_id, subfolder="scheduler")
+        pipe = CycleDiffusionPipeline.from_pretrained(
+            model_id, scheduler=scheduler, safety_checker=None, torch_dtype=torch.float16, revision="fp16"
+        )
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        source_prompt = "A black colored car"
+        prompt = "A blue colored car"
+
+        torch.manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            source_prompt=source_prompt,
+            init_image=init_image,
+            num_inference_steps=100,
+            eta=0.1,
+            strength=0.85,
+            guidance_scale=3,
+            source_guidance_scale=1,
+            output_type="np",
+        )
+        image = output.images
+
+        # the values aren't exactly equal, but the images look the same visually
+        assert np.abs(image - expected_image).max() < 1e-2
+
+    def test_cycle_diffusion_pipeline(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/cycle-diffusion/black_colored_car.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/blue_colored_car.npy"
+        )
+        init_image = init_image.resize((512, 512))
+
+        model_id = "CompVis/stable-diffusion-v1-4"
+        scheduler = DDIMScheduler.from_config(model_id, subfolder="scheduler")
+        pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, safety_checker=None)
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        source_prompt = "A black colored car"
+        prompt = "A blue colored car"
+
+        torch.manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            source_prompt=source_prompt,
+            init_image=init_image,
+            num_inference_steps=100,
+            eta=0.1,
+            strength=0.85,
+            guidance_scale=3,
+            source_guidance_scale=1,
+            output_type="np",
+        )
+        image = output.images
+
+        assert np.abs(image - expected_image).max() < 1e-2

From 08a6dc8a5840e0cc09e65e71e9647321ab9bb254 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Sat, 5 Nov 2022 22:17:41 +0100
Subject: [PATCH 136/269] Flax: Flip sin to cos in time embeddings (#1149)

Flip sin to cos in t embeddings.

This was assumed in the previous implementation, but now the default is
the opposite.

Fixes #1145.
---
 src/diffusers/models/embeddings_flax.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/models/embeddings_flax.py b/src/diffusers/models/embeddings_flax.py
index 1e2272c1fe70..bf7d54b82ec2 100644
--- a/src/diffusers/models/embeddings_flax.py
+++ b/src/diffusers/models/embeddings_flax.py
@@ -88,4 +88,6 @@ class FlaxTimesteps(nn.Module):
 
     @nn.compact
     def __call__(self, timesteps):
-        return get_sinusoidal_embeddings(timesteps, embedding_dim=self.dim, freq_shift=self.freq_shift)
+        return get_sinusoidal_embeddings(
+            timesteps, embedding_dim=self.dim, freq_shift=self.freq_shift, flip_sin_to_cos=True
+        )

From b4a1ed85440d4d9c1cafbe118ca6c034000c85f9 Mon Sep 17 00:00:00 2001
From: Cheng Lu <lucheng.lc15@gmail.com>
Date: Mon, 7 Nov 2022 05:49:55 +0800
Subject: [PATCH 137/269] Add multistep DPM-Solver discrete scheduler (#1132)

* add dpmsolver discrete pytorch scheduler

* fix some typos in dpm-solver pytorch

* add dpm-solver pytorch in stable-diffusion pipeline

* add jax/flax version dpm-solver

* change code style

* change code style

* add docs

* add `add_noise` method for dpmsolver

* add pytorch unit test for dpmsolver

* add dummy object for pytorch dpmsolver

* Update src/diffusers/schedulers/scheduling_dpmsolver_discrete.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Update tests/test_config.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Update tests/test_config.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* resolve the code comments

* rename the file

* change class name

* fix code style

* add auto docs for dpmsolver multistep

* add more explanations for the stabilizing trick (for steps < 15)

* delete the dummy file

* change the API name of predict_epsilon, algorithm_type and solver_type

* add compatible lists

Co-authored-by: Suraj Patil <surajp815@gmail.com>
---
 docs/source/api/schedulers.mdx                |   6 +
 src/diffusers/__init__.py                     |   2 +
 .../pipeline_flax_stable_diffusion.py         |  14 +-
 .../pipeline_stable_diffusion.py              |   8 +-
 src/diffusers/schedulers/__init__.py          |   2 +
 src/diffusers/schedulers/scheduling_ddim.py   |   1 +
 src/diffusers/schedulers/scheduling_ddpm.py   |   1 +
 .../scheduling_dpmsolver_multistep.py         | 506 +++++++++++++++
 .../scheduling_dpmsolver_multistep_flax.py    | 590 ++++++++++++++++++
 .../scheduling_euler_ancestral_discrete.py    |   1 +
 .../schedulers/scheduling_euler_discrete.py   |   1 +
 .../schedulers/scheduling_lms_discrete.py     |   1 +
 src/diffusers/schedulers/scheduling_pndm.py   |   1 +
 src/diffusers/utils/dummy_flax_objects.py     |  15 +
 src/diffusers/utils/dummy_pt_objects.py       |  15 +
 tests/test_config.py                          |  21 +-
 tests/test_scheduler.py                       | 182 ++++++
 17 files changed, 1362 insertions(+), 5 deletions(-)
 create mode 100644 src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
 create mode 100644 src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py

diff --git a/docs/source/api/schedulers.mdx b/docs/source/api/schedulers.mdx
index f073f6b37912..12575a5ecae2 100644
--- a/docs/source/api/schedulers.mdx
+++ b/docs/source/api/schedulers.mdx
@@ -70,6 +70,12 @@ Original paper can be found [here](https://arxiv.org/abs/2010.02502).
 
 [[autodoc]] DDPMScheduler
 
+#### Multistep DPM-Solver
+
+Original paper can be found [here](https://arxiv.org/abs/2206.00927) and the [improved version](https://arxiv.org/abs/2211.01095). The original implementation can be found [here](https://github.com/LuChengTHU/dpm-solver).
+
+[[autodoc]] DPMSolverMultistepScheduler
+
 #### Variance exploding, stochastic sampling from Karras et. al
 
 Original paper can be found [here](https://arxiv.org/abs/2006.11239).
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 3f3f3b56a253..da56dc888138 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -42,6 +42,7 @@
     from .schedulers import (
         DDIMScheduler,
         DDPMScheduler,
+        DPMSolverMultistepScheduler,
         EulerAncestralDiscreteScheduler,
         EulerDiscreteScheduler,
         IPNDMScheduler,
@@ -92,6 +93,7 @@
     from .schedulers import (
         FlaxDDIMScheduler,
         FlaxDDPMScheduler,
+        FlaxDPMSolverMultistepScheduler,
         FlaxKarrasVeScheduler,
         FlaxLMSDiscreteScheduler,
         FlaxPNDMScheduler,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index fe0e284c6720..5a910f8453ce 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -14,7 +14,12 @@
 
 from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel
 from ...pipeline_flax_utils import FlaxDiffusionPipeline
-from ...schedulers import FlaxDDIMScheduler, FlaxLMSDiscreteScheduler, FlaxPNDMScheduler
+from ...schedulers import (
+    FlaxDDIMScheduler,
+    FlaxDPMSolverMultistepScheduler,
+    FlaxLMSDiscreteScheduler,
+    FlaxPNDMScheduler,
+)
 from ...utils import logging
 from . import FlaxStableDiffusionPipelineOutput
 from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
@@ -43,7 +48,8 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
         unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
             A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
-            [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], or [`FlaxPNDMScheduler`].
+            [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or
+            [`FlaxDPMSolverMultistepScheduler`].
         safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
             Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
@@ -57,7 +63,9 @@ def __init__(
         text_encoder: FlaxCLIPTextModel,
         tokenizer: CLIPTokenizer,
         unet: FlaxUNet2DConditionModel,
-        scheduler: Union[FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler],
+        scheduler: Union[
+            FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
+        ],
         safety_checker: FlaxStableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
         dtype: jnp.dtype = jnp.float32,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 1ccc87804e68..094841f9778e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -11,6 +11,7 @@
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import (
     DDIMScheduler,
+    DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,
     EulerDiscreteScheduler,
     LMSDiscreteScheduler,
@@ -59,7 +60,12 @@ def __init__(
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
         scheduler: Union[
-            DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
         ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index 1be541ba8b66..6217bfcd6985 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -19,6 +19,7 @@
 if is_torch_available():
     from .scheduling_ddim import DDIMScheduler
     from .scheduling_ddpm import DDPMScheduler
+    from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler
     from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
     from .scheduling_euler_discrete import EulerDiscreteScheduler
     from .scheduling_ipndm import IPNDMScheduler
@@ -35,6 +36,7 @@
 if is_flax_available():
     from .scheduling_ddim_flax import FlaxDDIMScheduler
     from .scheduling_ddpm_flax import FlaxDDPMScheduler
+    from .scheduling_dpmsolver_multistep_flax import FlaxDPMSolverMultistepScheduler
     from .scheduling_karras_ve_flax import FlaxKarrasVeScheduler
     from .scheduling_lms_discrete_flax import FlaxLMSDiscreteScheduler
     from .scheduling_pndm_flax import FlaxPNDMScheduler
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 62ee9c0244ae..8d4407c16c30 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -115,6 +115,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         "LMSDiscreteScheduler",
         "EulerDiscreteScheduler",
         "EulerAncestralDiscreteScheduler",
+        "DPMSolverMultistepScheduler",
     ]
 
     @register_to_config
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 114a86b4320e..171c9598eba2 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -108,6 +108,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
         "LMSDiscreteScheduler",
         "EulerDiscreteScheduler",
         "EulerAncestralDiscreteScheduler",
+        "DPMSolverMultistepScheduler",
     ]
 
     @register_to_config
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
new file mode 100644
index 000000000000..d166354809b0
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -0,0 +1,506 @@
+# Copyright 2022 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils import SchedulerMixin, SchedulerOutput
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+
+    Returns:
+        betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+
+    def alpha_bar(time_step):
+        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return torch.tensor(betas, dtype=torch.float32)
+
+
+class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
+    """
+    DPM-Solver (and the improved version DPM-Solver++) is a fast dedicated high-order solver for diffusion ODEs with
+    the convergence order guarantee. Empirically, sampling by DPM-Solver with only 20 steps can generate high-quality
+    samples, and it can generate quite good samples even in only 10 steps.
+
+    For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095
+
+    Currently, we support the multistep DPM-Solver for both noise prediction models and data prediction models. We
+    recommend to use `solver_order=2` for guided sampling, and `solver_order=3` for unconditional sampling.
+
+    We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space
+    diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic
+    thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models (such as
+    stable-diffusion).
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+    [`~ConfigMixin.from_config`] functions.
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        solver_order (`int`, default `2`):
+            the order of DPM-Solver; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided
+            sampling, and `solver_order=3` for unconditional sampling.
+        predict_epsilon (`bool`, default `True`):
+            we currently support both the noise prediction model and the data prediction model. If the model predicts
+            the noise / epsilon, set `predict_epsilon` to `True`. If the model predicts the data / x0 directly, set
+            `predict_epsilon` to `False`.
+        thresholding (`bool`, default `False`):
+            whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
+            For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to
+            use the dynamic thresholding. Note that the thresholding method is unsuitable for latent-space diffusion
+            models (such as stable-diffusion).
+        dynamic_thresholding_ratio (`float`, default `0.995`):
+            the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen
+            (https://arxiv.org/abs/2205.11487).
+        sample_max_value (`float`, default `1.0`):
+            the threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++`.
+        algorithm_type (`str`, default `dpmsolver++`):
+            the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the
+            algorithms in https://arxiv.org/abs/2206.00927, and the `dpmsolver++` type implements the algorithms in
+            https://arxiv.org/abs/2211.01095. We recommend to use `dpmsolver++` with `solver_order=2` for guided
+            sampling (e.g. stable-diffusion).
+        solver_type (`str`, default `midpoint`):
+            the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects
+            the sample quality, especially for small number of steps. We empirically find that `midpoint` solvers are
+            slightly better, so we recommend to use the `midpoint` type.
+        lower_order_final (`bool`, default `True`):
+            whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically
+            find this trick can stabilize the sampling of DPM-Solver for steps < 15, especially for steps <= 10.
+
+    """
+
+    _compatible_classes = [
+        "DDIMScheduler",
+        "DDPMScheduler",
+        "PNDMScheduler",
+        "LMSDiscreteScheduler",
+        "EulerDiscreteScheduler",
+        "EulerAncestralDiscreteScheduler",
+    ]
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[np.ndarray] = None,
+        solver_order: int = 2,
+        predict_epsilon: bool = True,
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "dpmsolver++",
+        solver_type: str = "midpoint",
+        lower_order_final: bool = True,
+    ):
+        if trained_betas is not None:
+            self.betas = torch.from_numpy(trained_betas)
+        elif beta_schedule == "linear":
+            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = (
+                torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+            )
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # settings for DPM-Solver
+        if algorithm_type not in ["dpmsolver", "dpmsolver++"]:
+            raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
+        if solver_type not in ["midpoint", "heun"]:
+            raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
+
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.model_outputs = [None] * solver_order
+        self.lower_order_nums = 0
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, optional):
+                the device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
+        """
+        self.num_inference_steps = num_inference_steps
+        timesteps = (
+            np.linspace(0, self.num_train_timesteps - 1, num_inference_steps + 1)
+            .round()[::-1][:-1]
+            .copy()
+            .astype(np.int64)
+        )
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+        self.model_outputs = [
+            None,
+        ] * self.config.solver_order
+        self.lower_order_nums = 0
+
+    def convert_model_output(
+        self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        """
+        Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
+
+        DPM-Solver is designed to discretize an integral of the noise prediciton model, and DPM-Solver++ is designed to
+        discretize an integral of the data prediction model. So we need to first convert the model output to the
+        corresponding type to match the algorithm.
+
+        Note that the algorithm type and the model type is decoupled. That is to say, we can use either DPM-Solver or
+        DPM-Solver++ for both noise prediction model and data prediction model.
+
+        Args:
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `torch.FloatTensor`: the converted model output.
+        """
+        # DPM-Solver++ needs to solve an integral of the data prediction model.
+        if self.config.algorithm_type == "dpmsolver++":
+            if self.config.predict_epsilon:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            else:
+                x0_pred = model_output
+            if self.config.thresholding:
+                # Dynamic thresholding in https://arxiv.org/abs/2205.11487
+                dynamic_max_val = torch.quantile(
+                    torch.abs(x0_pred).reshape((x0_pred.shape[0], -1)), self.config.dynamic_thresholding_ratio, dim=1
+                )
+                dynamic_max_val = torch.maximum(
+                    dynamic_max_val,
+                    self.config.sample_max_value * torch.ones_like(dynamic_max_val).to(dynamic_max_val.device),
+                )[(...,) + (None,) * (x0_pred.ndim - 1)]
+                x0_pred = torch.clamp(x0_pred, -dynamic_max_val, dynamic_max_val) / dynamic_max_val
+            return x0_pred
+        # DPM-Solver needs to solve an integral of the noise prediction model.
+        elif self.config.algorithm_type == "dpmsolver":
+            if self.config.predict_epsilon:
+                return model_output
+            else:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+                return epsilon
+
+    def dpm_solver_first_order_update(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        prev_timestep: int,
+        sample: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        """
+        One step for the first-order DPM-Solver (equivalent to DDIM).
+
+        See https://arxiv.org/abs/2206.00927 for the detailed derivation.
+
+        Args:
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `torch.FloatTensor`: the sample tensor at the previous timestep.
+        """
+        lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
+        alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
+        sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
+        h = lambda_t - lambda_s
+        if self.config.algorithm_type == "dpmsolver++":
+            x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
+        elif self.config.algorithm_type == "dpmsolver":
+            x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output
+        return x_t
+
+    def multistep_dpm_solver_second_order_update(
+        self,
+        model_output_list: List[torch.FloatTensor],
+        timestep_list: List[int],
+        prev_timestep: int,
+        sample: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        """
+        One step for the second-order multistep DPM-Solver.
+
+        Args:
+            model_output_list (`List[torch.FloatTensor]`):
+                direct outputs from learned diffusion model at current and latter timesteps.
+            timestep (`int`): current and latter discrete timestep in the diffusion chain.
+            prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `torch.FloatTensor`: the sample tensor at the previous timestep.
+        """
+        t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+        lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
+        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                    + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+                )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                    - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+                )
+        return x_t
+
+    def multistep_dpm_solver_third_order_update(
+        self,
+        model_output_list: List[torch.FloatTensor],
+        timestep_list: List[int],
+        prev_timestep: int,
+        sample: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        """
+        One step for the third-order multistep DPM-Solver.
+
+        Args:
+            model_output_list (`List[torch.FloatTensor]`):
+                direct outputs from learned diffusion model at current and latter timesteps.
+            timestep (`int`): current and latter discrete timestep in the diffusion chain.
+            prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `torch.FloatTensor`: the sample tensor at the previous timestep.
+        """
+        t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
+        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+        lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
+            self.lambda_t[t],
+            self.lambda_t[s0],
+            self.lambda_t[s1],
+            self.lambda_t[s2],
+        )
+        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+        r0, r1 = h_0 / h, h_1 / h
+        D0 = m0
+        D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (sigma_t / sigma_s0) * sample
+                - (alpha_t * (torch.exp(-h) - 1.0)) * D0
+                + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
+                - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+            )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (alpha_t / alpha_s0) * sample
+                - (sigma_t * (torch.exp(h) - 1.0)) * D0
+                - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
+                - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+            )
+        return x_t
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        sample: torch.FloatTensor,
+        return_dict: bool = True,
+    ) -> Union[SchedulerOutput, Tuple]:
+        """
+        Step function propagating the sample with the multistep DPM-Solver.
+
+        Args:
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
+            return_dict (`bool`): option for returning tuple rather than SchedulerOutput class
+
+        Returns:
+            [`~scheduling_utils.SchedulerOutput`] or `tuple`: [`~scheduling_utils.SchedulerOutput`] if `return_dict` is
+            True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.to(self.timesteps.device)
+        step_index = (self.timesteps == timestep).nonzero()
+        if len(step_index) == 0:
+            step_index = len(self.timesteps) - 1
+        else:
+            step_index = step_index.item()
+        prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
+        lower_order_final = (
+            (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15
+        )
+        lower_order_second = (
+            (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
+        )
+
+        model_output = self.convert_model_output(model_output, timestep, sample)
+        for i in range(self.config.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+        self.model_outputs[-1] = model_output
+
+        if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+            prev_sample = self.dpm_solver_first_order_update(model_output, timestep, prev_timestep, sample)
+        elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+            timestep_list = [self.timesteps[step_index - 1], timestep]
+            prev_sample = self.multistep_dpm_solver_second_order_update(
+                self.model_outputs, timestep_list, prev_timestep, sample
+            )
+        else:
+            timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
+            prev_sample = self.multistep_dpm_solver_third_order_update(
+                self.model_outputs, timestep_list, prev_timestep, sample
+            )
+
+        if self.lower_order_nums < self.config.solver_order:
+            self.lower_order_nums += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return SchedulerOutput(prev_sample=prev_sample)
+
+    def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            sample (`torch.FloatTensor`): input sample
+
+        Returns:
+            `torch.FloatTensor`: scaled input sample
+        """
+        return sample
+
+    def add_noise(
+        self,
+        original_samples: torch.FloatTensor,
+        noise: torch.FloatTensor,
+        timesteps: torch.IntTensor,
+    ) -> torch.FloatTensor:
+        # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
+        self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
+        timesteps = timesteps.to(original_samples.device)
+
+        sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
+
+        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
+            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
new file mode 100644
index 000000000000..c9a6d1cd5c0b
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
@@ -0,0 +1,590 @@
+# Copyright 2022 TSAIL Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import flax
+import jax
+import jax.numpy as jnp
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999) -> jnp.ndarray:
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+
+
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+
+    Returns:
+        betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs
+    """
+
+    def alpha_bar(time_step):
+        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return jnp.array(betas, dtype=jnp.float32)
+
+
+@flax.struct.dataclass
+class DPMSolverMultistepSchedulerState:
+    # setable values
+    num_inference_steps: Optional[int] = None
+    timesteps: Optional[jnp.ndarray] = None
+
+    # running values
+    model_outputs: Optional[jnp.ndarray] = None
+    lower_order_nums: Optional[int] = None
+    step_index: Optional[int] = None
+    prev_timestep: Optional[int] = None
+    cur_sample: Optional[jnp.ndarray] = None
+
+    @classmethod
+    def create(cls, num_train_timesteps: int):
+        return cls(timesteps=jnp.arange(0, num_train_timesteps)[::-1])
+
+
+@dataclass
+class FlaxDPMSolverMultistepSchedulerOutput(FlaxSchedulerOutput):
+    state: DPMSolverMultistepSchedulerState
+
+
+class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
+    """
+    DPM-Solver (and the improved version DPM-Solver++) is a fast dedicated high-order solver for diffusion ODEs with
+    the convergence order guarantee. Empirically, sampling by DPM-Solver with only 20 steps can generate high-quality
+    samples, and it can generate quite good samples even in only 10 steps.
+
+    For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095
+
+    Currently, we support the multistep DPM-Solver for both noise prediction models and data prediction models. We
+    recommend to use `solver_order=2` for guided sampling, and `solver_order=3` for unconditional sampling.
+
+    We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space
+    diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic
+    thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models (such as
+    stable-diffusion).
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+    [`~ConfigMixin.from_config`] functions.
+
+    For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        solver_order (`int`, default `2`):
+            the order of DPM-Solver; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided
+            sampling, and `solver_order=3` for unconditional sampling.
+        predict_epsilon (`bool`, default `True`):
+            we currently support both the noise prediction model and the data prediction model. If the model predicts
+            the noise / epsilon, set `predict_epsilon` to `True`. If the model predicts the data / x0 directly, set
+            `predict_epsilon` to `False`.
+        thresholding (`bool`, default `False`):
+            whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
+            For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to
+            use the dynamic thresholding. Note that the thresholding method is unsuitable for latent-space diffusion
+            models (such as stable-diffusion).
+        dynamic_thresholding_ratio (`float`, default `0.995`):
+            the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen
+            (https://arxiv.org/abs/2205.11487).
+        sample_max_value (`float`, default `1.0`):
+            the threshold value for dynamic thresholding. Valid only when `thresholding=True` and
+            `algorithm_type="dpmsolver++`.
+        algorithm_type (`str`, default `dpmsolver++`):
+            the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the
+            algorithms in https://arxiv.org/abs/2206.00927, and the `dpmsolver++` type implements the algorithms in
+            https://arxiv.org/abs/2211.01095. We recommend to use `dpmsolver++` with `solver_order=2` for guided
+            sampling (e.g. stable-diffusion).
+        solver_type (`str`, default `midpoint`):
+            the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects
+            the sample quality, especially for small number of steps. We empirically find that `midpoint` solvers are
+            slightly better, so we recommend to use the `midpoint` type.
+        lower_order_final (`bool`, default `True`):
+            whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically
+            find this trick can stabilize the sampling of DPM-Solver for steps < 15, especially for steps <= 10.
+
+    """
+
+    @property
+    def has_state(self):
+        return True
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        beta_start: float = 0.0001,
+        beta_end: float = 0.02,
+        beta_schedule: str = "linear",
+        trained_betas: Optional[jnp.ndarray] = None,
+        solver_order: int = 2,
+        predict_epsilon: bool = True,
+        thresholding: bool = False,
+        dynamic_thresholding_ratio: float = 0.995,
+        sample_max_value: float = 1.0,
+        algorithm_type: str = "dpmsolver++",
+        solver_type: str = "midpoint",
+        lower_order_final: bool = True,
+    ):
+        if trained_betas is not None:
+            self.betas = jnp.asarray(trained_betas)
+        elif beta_schedule == "linear":
+            self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
+        elif beta_schedule == "scaled_linear":
+            # this schedule is very specific to the latent diffusion model.
+            self.betas = jnp.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=jnp.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = jnp.cumprod(self.alphas, axis=0)
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = jnp.sqrt(self.alphas_cumprod)
+        self.sigma_t = jnp.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = jnp.log(self.alpha_t) - jnp.log(self.sigma_t)
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        # settings for DPM-Solver
+        if algorithm_type not in ["dpmsolver", "dpmsolver++"]:
+            raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
+        if solver_type not in ["midpoint", "heun"]:
+            raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
+
+    def create_state(self):
+        return DPMSolverMultistepSchedulerState.create(num_train_timesteps=self.config.num_train_timesteps)
+
+    def set_timesteps(
+        self, state: DPMSolverMultistepSchedulerState, num_inference_steps: int, shape: Tuple
+    ) -> DPMSolverMultistepSchedulerState:
+        """
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
+
+        Args:
+            state (`DPMSolverMultistepSchedulerState`):
+                the `FlaxDPMSolverMultistepScheduler` state data class instance.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
+            shape (`Tuple`):
+                the shape of the samples to be generated.
+        """
+        timesteps = (
+            jnp.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1)
+            .round()[::-1][:-1]
+            .astype(jnp.int32)
+        )
+
+        return state.replace(
+            num_inference_steps=num_inference_steps,
+            timesteps=timesteps,
+            model_outputs=jnp.zeros((self.config.solver_order,) + shape),
+            lower_order_nums=0,
+            step_index=0,
+            prev_timestep=-1,
+            cur_sample=jnp.zeros(shape),
+        )
+
+    def convert_model_output(
+        self,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+    ) -> jnp.ndarray:
+        """
+        Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
+
+        DPM-Solver is designed to discretize an integral of the noise prediciton model, and DPM-Solver++ is designed to
+        discretize an integral of the data prediction model. So we need to first convert the model output to the
+        corresponding type to match the algorithm.
+
+        Note that the algorithm type and the model type is decoupled. That is to say, we can use either DPM-Solver or
+        DPM-Solver++ for both noise prediction model and data prediction model.
+
+        Args:
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `jnp.ndarray`: the converted model output.
+        """
+        # DPM-Solver++ needs to solve an integral of the data prediction model.
+        if self.config.algorithm_type == "dpmsolver++":
+            if self.config.predict_epsilon:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            else:
+                x0_pred = model_output
+            if self.config.thresholding:
+                # Dynamic thresholding in https://arxiv.org/abs/2205.11487
+                dynamic_max_val = jnp.percentile(
+                    jnp.abs(x0_pred), self.config.dynamic_thresholding_ratio, axis=tuple(range(1, x0_pred.ndim))
+                )
+                dynamic_max_val = jnp.maximum(
+                    dynamic_max_val, self.config.sample_max_value * jnp.ones_like(dynamic_max_val)
+                )
+                x0_pred = jnp.clip(x0_pred, -dynamic_max_val, dynamic_max_val) / dynamic_max_val
+            return x0_pred
+        # DPM-Solver needs to solve an integral of the noise prediction model.
+        elif self.config.algorithm_type == "dpmsolver":
+            if self.config.predict_epsilon:
+                return model_output
+            else:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+                return epsilon
+
+    def dpm_solver_first_order_update(
+        self, model_output: jnp.ndarray, timestep: int, prev_timestep: int, sample: jnp.ndarray
+    ) -> jnp.ndarray:
+        """
+        One step for the first-order DPM-Solver (equivalent to DDIM).
+
+        See https://arxiv.org/abs/2206.00927 for the detailed derivation.
+
+        Args:
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `jnp.ndarray`: the sample tensor at the previous timestep.
+        """
+        t, s0 = prev_timestep, timestep
+        m0 = model_output
+        lambda_t, lambda_s = self.lambda_t[t], self.lambda_t[s0]
+        alpha_t, alpha_s = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s = self.sigma_t[t], self.sigma_t[s0]
+        h = lambda_t - lambda_s
+        if self.config.algorithm_type == "dpmsolver++":
+            x_t = (sigma_t / sigma_s) * sample - (alpha_t * (jnp.exp(-h) - 1.0)) * m0
+        elif self.config.algorithm_type == "dpmsolver":
+            x_t = (alpha_t / alpha_s) * sample - (sigma_t * (jnp.exp(h) - 1.0)) * m0
+        return x_t
+
+    def multistep_dpm_solver_second_order_update(
+        self,
+        model_output_list: jnp.ndarray,
+        timestep_list: List[int],
+        prev_timestep: int,
+        sample: jnp.ndarray,
+    ) -> jnp.ndarray:
+        """
+        One step for the second-order multistep DPM-Solver.
+
+        Args:
+            model_output_list (`List[jnp.ndarray]`):
+                direct outputs from learned diffusion model at current and latter timesteps.
+            timestep (`int`): current and latter discrete timestep in the diffusion chain.
+            prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `jnp.ndarray`: the sample tensor at the previous timestep.
+        """
+        t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+        lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
+        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (jnp.exp(-h) - 1.0)) * D0
+                    - 0.5 * (alpha_t * (jnp.exp(-h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (sigma_t / sigma_s0) * sample
+                    - (alpha_t * (jnp.exp(-h) - 1.0)) * D0
+                    + (alpha_t * ((jnp.exp(-h) - 1.0) / h + 1.0)) * D1
+                )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.config.solver_type == "midpoint":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - (sigma_t * (jnp.exp(h) - 1.0)) * D0
+                    - 0.5 * (sigma_t * (jnp.exp(h) - 1.0)) * D1
+                )
+            elif self.config.solver_type == "heun":
+                x_t = (
+                    (alpha_t / alpha_s0) * sample
+                    - (sigma_t * (jnp.exp(h) - 1.0)) * D0
+                    - (sigma_t * ((jnp.exp(h) - 1.0) / h - 1.0)) * D1
+                )
+        return x_t
+
+    def multistep_dpm_solver_third_order_update(
+        self,
+        model_output_list: jnp.ndarray,
+        timestep_list: List[int],
+        prev_timestep: int,
+        sample: jnp.ndarray,
+    ) -> jnp.ndarray:
+        """
+        One step for the third-order multistep DPM-Solver.
+
+        Args:
+            model_output_list (`List[jnp.ndarray]`):
+                direct outputs from learned diffusion model at current and latter timesteps.
+            timestep (`int`): current and latter discrete timestep in the diffusion chain.
+            prev_timestep (`int`): previous discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+
+        Returns:
+            `jnp.ndarray`: the sample tensor at the previous timestep.
+        """
+        t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
+        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+        lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
+            self.lambda_t[t],
+            self.lambda_t[s0],
+            self.lambda_t[s1],
+            self.lambda_t[s2],
+        )
+        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+        r0, r1 = h_0 / h, h_1 / h
+        D0 = m0
+        D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+        if self.config.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (sigma_t / sigma_s0) * sample
+                - (alpha_t * (jnp.exp(-h) - 1.0)) * D0
+                + (alpha_t * ((jnp.exp(-h) - 1.0) / h + 1.0)) * D1
+                - (alpha_t * ((jnp.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
+            )
+        elif self.config.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                (alpha_t / alpha_s0) * sample
+                - (sigma_t * (jnp.exp(h) - 1.0)) * D0
+                - (sigma_t * ((jnp.exp(h) - 1.0) / h - 1.0)) * D1
+                - (sigma_t * ((jnp.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
+            )
+        return x_t
+
+    def step(
+        self,
+        state: DPMSolverMultistepSchedulerState,
+        model_output: jnp.ndarray,
+        timestep: int,
+        sample: jnp.ndarray,
+        return_dict: bool = True,
+    ) -> Union[FlaxDPMSolverMultistepSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by DPM-Solver. Core function to propagate the diffusion process
+        from the learned model outputs (most often the predicted noise).
+
+        Args:
+            state (`DPMSolverMultistepSchedulerState`):
+                the `FlaxDPMSolverMultistepScheduler` state data class instance.
+            model_output (`jnp.ndarray`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`jnp.ndarray`):
+                current instance of sample being created by diffusion process.
+            return_dict (`bool`): option for returning tuple rather than FlaxDPMSolverMultistepSchedulerOutput class
+
+        Returns:
+            [`FlaxDPMSolverMultistepSchedulerOutput`] or `tuple`: [`FlaxDPMSolverMultistepSchedulerOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
+
+        """
+        prev_timestep = jax.lax.cond(
+            state.step_index == len(state.timesteps) - 1,
+            lambda _: 0,
+            lambda _: state.timesteps[state.step_index + 1],
+            (),
+        )
+
+        model_output = self.convert_model_output(model_output, timestep, sample)
+
+        model_outputs_new = jnp.roll(state.model_outputs, -1, axis=0)
+        model_outputs_new = model_outputs_new.at[-1].set(model_output)
+        state = state.replace(
+            model_outputs=model_outputs_new,
+            prev_timestep=prev_timestep,
+            cur_sample=sample,
+        )
+
+        def step_1(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+            return self.dpm_solver_first_order_update(
+                state.model_outputs[-1],
+                state.timesteps[state.step_index],
+                state.prev_timestep,
+                state.cur_sample,
+            )
+
+        def step_23(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+            def step_2(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+                timestep_list = jnp.array([state.timesteps[state.step_index - 1], state.timesteps[state.step_index]])
+                return self.multistep_dpm_solver_second_order_update(
+                    state.model_outputs,
+                    timestep_list,
+                    state.prev_timestep,
+                    state.cur_sample,
+                )
+
+            def step_3(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray:
+                timestep_list = jnp.array(
+                    [
+                        state.timesteps[state.step_index - 2],
+                        state.timesteps[state.step_index - 1],
+                        state.timesteps[state.step_index],
+                    ]
+                )
+                return self.multistep_dpm_solver_third_order_update(
+                    state.model_outputs,
+                    timestep_list,
+                    state.prev_timestep,
+                    state.cur_sample,
+                )
+
+            if self.config.solver_order == 2:
+                return step_2(state)
+            elif self.config.lower_order_final and len(state.timesteps) < 15:
+                return jax.lax.cond(
+                    state.lower_order_nums < 2,
+                    step_2,
+                    lambda state: jax.lax.cond(
+                        state.step_index == len(state.timesteps) - 2,
+                        step_2,
+                        step_3,
+                        state,
+                    ),
+                    state,
+                )
+            else:
+                return jax.lax.cond(
+                    state.lower_order_nums < 2,
+                    step_2,
+                    step_3,
+                    state,
+                )
+
+        if self.config.solver_order == 1:
+            prev_sample = step_1(state)
+        elif self.config.lower_order_final and len(state.timesteps) < 15:
+            prev_sample = jax.lax.cond(
+                state.lower_order_nums < 1,
+                step_1,
+                lambda state: jax.lax.cond(
+                    state.step_index == len(state.timesteps) - 1,
+                    step_1,
+                    step_23,
+                    state,
+                ),
+                state,
+            )
+        else:
+            prev_sample = jax.lax.cond(
+                state.lower_order_nums < 1,
+                step_1,
+                step_23,
+                state,
+            )
+
+        state = state.replace(
+            lower_order_nums=jnp.minimum(state.lower_order_nums + 1, self.config.solver_order),
+            step_index=(state.step_index + 1),
+        )
+
+        if not return_dict:
+            return (prev_sample, state)
+
+        return FlaxDPMSolverMultistepSchedulerOutput(prev_sample=prev_sample, state=state)
+
+    def scale_model_input(
+        self, state: DPMSolverMultistepSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None
+    ) -> jnp.ndarray:
+        """
+        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
+        current timestep.
+
+        Args:
+            state (`DPMSolverMultistepSchedulerState`):
+                the `FlaxDPMSolverMultistepScheduler` state data class instance.
+            sample (`jnp.ndarray`): input sample
+            timestep (`int`, optional): current timestep
+
+        Returns:
+            `jnp.ndarray`: scaled input sample
+        """
+        return sample
+
+    def add_noise(
+        self,
+        original_samples: jnp.ndarray,
+        noise: jnp.ndarray,
+        timesteps: jnp.ndarray,
+    ) -> jnp.ndarray:
+        sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
+        sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape)
+
+        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.0
+        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
+        sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape)
+
+        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
+        return noisy_samples
+
+    def __len__(self):
+        return self.config.num_train_timesteps
diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
index fe45b3d591f5..7f44067325cf 100644
--- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -73,6 +73,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
         "LMSDiscreteScheduler",
         "PNDMScheduler",
         "EulerDiscreteScheduler",
+        "DPMSolverMultistepScheduler",
     ]
 
     @register_to_config
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index 0cb31a451272..50a1bd89f839 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -74,6 +74,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
         "LMSDiscreteScheduler",
         "PNDMScheduler",
         "EulerAncestralDiscreteScheduler",
+        "DPMSolverMultistepScheduler",
     ]
 
     @register_to_config
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index 8d633267c607..d636fe6fe87f 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -73,6 +73,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
         "PNDMScheduler",
         "EulerDiscreteScheduler",
         "EulerAncestralDiscreteScheduler",
+        "DPMSolverMultistepScheduler",
     ]
 
     @register_to_config
diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py
index 0082ede787b8..eec18af8d382 100644
--- a/src/diffusers/schedulers/scheduling_pndm.py
+++ b/src/diffusers/schedulers/scheduling_pndm.py
@@ -94,6 +94,7 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
         "LMSDiscreteScheduler",
         "EulerDiscreteScheduler",
         "EulerAncestralDiscreteScheduler",
+        "DPMSolverMultistepScheduler",
     ]
 
     @register_to_config
diff --git a/src/diffusers/utils/dummy_flax_objects.py b/src/diffusers/utils/dummy_flax_objects.py
index 708022d85b31..8e308bb41bea 100644
--- a/src/diffusers/utils/dummy_flax_objects.py
+++ b/src/diffusers/utils/dummy_flax_objects.py
@@ -94,6 +94,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["flax"])
 
 
+class FlaxDPMSolverMultistepScheduler(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["flax"])
+
+
 class FlaxKarrasVeScheduler(metaclass=DummyObject):
     _backends = ["flax"]
 
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 25aa82d6c5b2..9d296d29977d 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -302,6 +302,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class DPMSolverMultistepScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class EulerAncestralDiscreteScheduler(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/test_config.py b/tests/test_config.py
index 7a9f270af364..5084769def41 100755
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -19,7 +19,14 @@
 import unittest
 
 import diffusers
-from diffusers import DDIMScheduler, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, PNDMScheduler, logging
+from diffusers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    PNDMScheduler,
+    logging,
+)
 from diffusers.configuration_utils import ConfigMixin, register_to_config
 from diffusers.utils.testing_utils import CaptureLogger
 
@@ -283,3 +290,15 @@ def test_load_pndm(self):
         assert pndm.__class__ == PNDMScheduler
         # no warning should be thrown
         assert cap_logger.out == ""
+
+    def test_load_dpmsolver(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with CaptureLogger(logger) as cap_logger:
+            dpm = DPMSolverMultistepScheduler.from_config(
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+            )
+
+        assert dpm.__class__ == DPMSolverMultistepScheduler
+        # no warning should be thrown
+        assert cap_logger.out == ""
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 29186aaac99b..056f723835ba 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -24,6 +24,7 @@
 from diffusers import (
     DDIMScheduler,
     DDPMScheduler,
+    DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,
     EulerDiscreteScheduler,
     IPNDMScheduler,
@@ -549,6 +550,187 @@ def test_full_loop_with_no_set_alpha_to_one(self):
         assert abs(result_mean.item() - 0.1941) < 1e-3
 
 
+class DPMSolverMultistepSchedulerTest(SchedulerCommonTest):
+    scheduler_classes = (DPMSolverMultistepScheduler,)
+    forward_default_kwargs = (("num_inference_steps", 25),)
+
+    def get_scheduler_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "beta_start": 0.0001,
+            "beta_end": 0.02,
+            "beta_schedule": "linear",
+            "solver_order": 2,
+            "predict_epsilon": True,
+            "thresholding": False,
+            "sample_max_value": 1.0,
+            "algorithm_type": "dpmsolver++",
+            "solver_type": "midpoint",
+            "lower_order_final": False,
+        }
+
+        config.update(**kwargs)
+        return config
+
+    def check_over_configs(self, time_step=0, **config):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(**config)
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+            # copy over dummy past residuals
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler.set_timesteps(num_inference_steps)
+                # copy over dummy past residuals
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output, new_output = sample, sample
+            for t in range(time_step, time_step + scheduler.config.solver_order + 1):
+                output = scheduler.step(residual, t, output, **kwargs).prev_sample
+                new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample
+
+                assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def test_from_pretrained_save_pretrained(self):
+        pass
+
+    def check_over_forward(self, time_step=0, **forward_kwargs):
+        kwargs = dict(self.forward_default_kwargs)
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+        sample = self.dummy_sample
+        residual = 0.1 * sample
+        dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+            scheduler.set_timesteps(num_inference_steps)
+
+            # copy over dummy past residuals (must be after setting timesteps)
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_config(tmpdirname)
+                new_scheduler = scheduler_class.from_config(tmpdirname)
+                # copy over dummy past residuals
+                new_scheduler.set_timesteps(num_inference_steps)
+
+                # copy over dummy past residual (must be after setting timesteps)
+                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
+
+            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
+
+            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
+
+    def full_loop(self, **config):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config(**config)
+        scheduler = scheduler_class(**scheduler_config)
+
+        num_inference_steps = 10
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        for i, t in enumerate(scheduler.timesteps):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        return sample
+
+    def test_step_shape(self):
+        kwargs = dict(self.forward_default_kwargs)
+
+        num_inference_steps = kwargs.pop("num_inference_steps", None)
+
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+            scheduler = scheduler_class(**scheduler_config)
+
+            sample = self.dummy_sample
+            residual = 0.1 * sample
+
+            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
+                scheduler.set_timesteps(num_inference_steps)
+            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
+                kwargs["num_inference_steps"] = num_inference_steps
+
+            # copy over dummy past residuals (must be done after set_timesteps)
+            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
+            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
+
+            time_step_0 = scheduler.timesteps[5]
+            time_step_1 = scheduler.timesteps[6]
+
+            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
+            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
+
+            self.assertEqual(output_0.shape, sample.shape)
+            self.assertEqual(output_0.shape, output_1.shape)
+
+    def test_timesteps(self):
+        for timesteps in [25, 50, 100, 999, 1000]:
+            self.check_over_configs(num_train_timesteps=timesteps)
+
+    def test_thresholding(self):
+        self.check_over_configs(thresholding=False)
+        for order in [1, 2, 3]:
+            for solver_type in ["midpoint", "heun"]:
+                for threshold in [0.5, 1.0, 2.0]:
+                    for predict_epsilon in [True, False]:
+                        self.check_over_configs(
+                            thresholding=True,
+                            predict_epsilon=predict_epsilon,
+                            sample_max_value=threshold,
+                            algorithm_type="dpmsolver++",
+                            solver_order=order,
+                            solver_type=solver_type,
+                        )
+
+    def test_solver_order_and_type(self):
+        for algorithm_type in ["dpmsolver", "dpmsolver++"]:
+            for solver_type in ["midpoint", "heun"]:
+                for order in [1, 2, 3]:
+                    for predict_epsilon in [True, False]:
+                        self.check_over_configs(
+                            solver_order=order,
+                            solver_type=solver_type,
+                            predict_epsilon=predict_epsilon,
+                            algorithm_type=algorithm_type,
+                        )
+                        sample = self.full_loop(
+                            solver_order=order,
+                            solver_type=solver_type,
+                            predict_epsilon=predict_epsilon,
+                            algorithm_type=algorithm_type,
+                        )
+                        assert not torch.isnan(sample).any(), "Samples have nan numbers"
+
+    def test_lower_order_final(self):
+        self.check_over_configs(lower_order_final=True)
+        self.check_over_configs(lower_order_final=False)
+
+    def test_inference_steps(self):
+        for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]:
+            self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0)
+
+    def test_full_loop_no_noise(self):
+        sample = self.full_loop()
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.3301) < 1e-3
+
+
 class PNDMSchedulerTest(SchedulerCommonTest):
     scheduler_classes = (PNDMScheduler,)
     forward_default_kwargs = (("num_inference_steps", 50),)

From e86a280c455130d597e352a6fe90367b14bfe925 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Mon, 7 Nov 2022 12:27:17 +0100
Subject: [PATCH 138/269] Remove warning about half precision on MPS (#1163)

Remove warning about half precision on MPS.
---
 src/diffusers/pipeline_utils.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 97e196e72338..a708d0cfb598 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -209,13 +209,13 @@ def to(self, torch_device: Optional[Union[str, torch.device]] = None):
         for name in module_names.keys():
             module = getattr(self, name)
             if isinstance(module, torch.nn.Module):
-                if module.dtype == torch.float16 and str(torch_device) in ["cpu", "mps"]:
+                if module.dtype == torch.float16 and str(torch_device) in ["cpu"]:
                     logger.warning(
-                        "Pipelines loaded with `torch_dtype=torch.float16` cannot run with `cpu` or `mps` device. It"
-                        " is not recommended to move them to `cpu` or `mps` as running them will fail. Please make"
-                        " sure to use a `cuda` device to run the pipeline in inference. due to the lack of support for"
-                        " `float16` operations on those devices in PyTorch. Please remove the"
-                        " `torch_dtype=torch.float16` argument, or use a `cuda` device to run inference."
+                        "Pipelines loaded with `torch_dtype=torch.float16` cannot run with `cpu` device. It"
+                        " is not recommended to move them to `cpu` as running them will fail. Please make"
+                        " sure to use an accelerator to run the pipeline in inference, due to the lack of"
+                        " support for`float16` operations on this device in PyTorch. Please, remove the"
+                        " `torch_dtype=torch.float16` argument, or use another device for inference."
                     )
                 module.to(torch_device)
         return self

From cd502b25cf0debac6f98d27a6638ef95208d1ea2 Mon Sep 17 00:00:00 2001
From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com>
Date: Mon, 7 Nov 2022 19:34:45 +0700
Subject: [PATCH 139/269] Fix typo latens -> latents (#1171)

fix typo
---
 examples/community/composable_stable_diffusion.py               | 2 +-
 examples/community/imagic_stable_diffusion.py                   | 2 +-
 examples/community/interpolate_stable_diffusion.py              | 2 +-
 examples/community/lpw_stable_diffusion.py                      | 2 +-
 examples/community/seed_resize_stable_diffusion.py              | 2 +-
 examples/community/stable_diffusion_mega.py                     | 2 +-
 examples/community/wildcard_stable_diffusion.py                 | 2 +-
 .../pipelines/latent_diffusion/pipeline_latent_diffusion.py     | 2 +-
 .../latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py | 2 +-
 .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py      | 2 +-
 .../stable_diffusion/pipeline_flax_stable_diffusion.py          | 2 +-
 .../stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py  | 2 +-
 .../stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py  | 2 +-
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py     | 2 +-
 .../stable_diffusion/pipeline_stable_diffusion_img2img.py       | 2 +-
 .../stable_diffusion/pipeline_stable_diffusion_inpaint.py       | 2 +-
 .../pipeline_stable_diffusion_inpaint_legacy.py                 | 2 +-
 17 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py
index 10d34d255a20..eb207e1bdd47 100644
--- a/examples/community/composable_stable_diffusion.py
+++ b/examples/community/composable_stable_diffusion.py
@@ -32,7 +32,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offsensive or harmful.
diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py
index 92aa677b4626..0c95fb435848 100644
--- a/examples/community/imagic_stable_diffusion.py
+++ b/examples/community/imagic_stable_diffusion.py
@@ -54,7 +54,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offsensive or harmful.
diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py
index de1c6f687aa8..761aaeca69c1 100644
--- a/examples/community/interpolate_stable_diffusion.py
+++ b/examples/community/interpolate_stable_diffusion.py
@@ -65,7 +65,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index 39a266dec972..e4ee7bf3c6fa 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -398,7 +398,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py
index f912663a687d..92cd1c04f9f3 100644
--- a/examples/community/seed_resize_stable_diffusion.py
+++ b/examples/community/seed_resize_stable_diffusion.py
@@ -37,7 +37,7 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py
index 723951941576..67112b282b67 100644
--- a/examples/community/stable_diffusion_mega.py
+++ b/examples/community/stable_diffusion_mega.py
@@ -42,7 +42,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionMegaSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py
index b0f6375d50c6..9ad0d8e9fa04 100644
--- a/examples/community/wildcard_stable_diffusion.py
+++ b/examples/community/wildcard_stable_diffusion.py
@@ -99,7 +99,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
index d8948862845d..cfa39949130e 100644
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -32,7 +32,7 @@ class LDMTextToImagePipeline(DiffusionPipeline):
             [BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
     """
 
diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
index a7ffb4adc99a..c8da6f193e15 100644
--- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
+++ b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
@@ -18,7 +18,7 @@ class LDMPipeline(DiffusionPipeline):
             Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
         unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latens.
+            [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latents.
     """
 
     def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index f38e57983f15..3d2ec7d55b98 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -107,7 +107,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index 5a910f8453ce..73ec322c6646 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -47,7 +47,7 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`FlaxUNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or
             [`FlaxDPMSolverMultistepScheduler`].
         safety_checker ([`FlaxStableDiffusionSafetyChecker`]):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index 5e6b2e6f2fc3..04ecdbecc6d5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -46,7 +46,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 2ce9831a1676..517242921d92 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -59,7 +59,7 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 094841f9778e..9c7edabf69f1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -44,7 +44,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 08b14b36be89..f6f38ab1d376 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -55,7 +55,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 34e8231c63ee..a7af1c9d3351 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -56,7 +56,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 5c06b74bfa38..91dcefc91dcd 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -62,7 +62,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
         unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
             [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
         safety_checker ([`StableDiffusionSafetyChecker`]):
             Classification module that estimates whether generated images could be considered offensive or harmful.

From 0dd8c6b4dbab4069de9ed1cafb53cbd495873879 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Mon, 7 Nov 2022 14:32:51 +0100
Subject: [PATCH 140/269] Fix community pipeline links (#1162)

* Change title to match the sidebar in _toctree.

* Fix custom pipe link, add link to contribute.

* Fix community pipeline links.
---
 README.md                                                | 2 +-
 docs/source/using-diffusers/contribute_pipeline.mdx      | 2 +-
 docs/source/using-diffusers/custom_pipeline_overview.mdx | 2 +-
 src/diffusers/pipeline_utils.py                          | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 9c44cff2c8e2..d3ac5702cd5f 100644
--- a/README.md
+++ b/README.md
@@ -353,7 +353,7 @@ Textual Inversion is a technique for capturing novel concepts from a small numbe
 
 ## Stable Diffusion Community Pipelines
 
-The release of Stable Diffusion as an open source model has fostered a lot of interesting ideas and experimentation. Our [Community Examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community) contains many ideas worth exploring, like interpolating to create animated videos, using CLIP Guidance for additional prompt fidelity, term weighting, and much more! Take a look and [contribute your own](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipelines).
+The release of Stable Diffusion as an open source model has fostered a lot of interesting ideas and experimentation. Our [Community Examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community) contains many ideas worth exploring, like interpolating to create animated videos, using CLIP Guidance for additional prompt fidelity, term weighting, and much more! [Take a look](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview) and [contribute your own](https://huggingface.co/docs/diffusers/using-diffusers/contribute_pipeline).
 
 ## Other Examples
 
diff --git a/docs/source/using-diffusers/contribute_pipeline.mdx b/docs/source/using-diffusers/contribute_pipeline.mdx
index 6122a996e1c1..18e84cdfbc94 100644
--- a/docs/source/using-diffusers/contribute_pipeline.mdx
+++ b/docs/source/using-diffusers/contribute_pipeline.mdx
@@ -128,7 +128,7 @@ pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeli
 pipe()
 ```
 
-Another way to upload your custom_pipeline, besides sending a PR, is uploading the code that contains it to the Hugging Face Hub, [as exemplified here](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipelines#loading-custom-pipelines-from-the-hub).
+Another way to upload your custom_pipeline, besides sending a PR, is uploading the code that contains it to the Hugging Face Hub, [as exemplified here](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview#loading-custom-pipelines-from-the-hub).
 
 **Try it out now - it works!**
 
diff --git a/docs/source/using-diffusers/custom_pipeline_overview.mdx b/docs/source/using-diffusers/custom_pipeline_overview.mdx
index b1e9c0ce6178..ae5bad2d7bf2 100644
--- a/docs/source/using-diffusers/custom_pipeline_overview.mdx
+++ b/docs/source/using-diffusers/custom_pipeline_overview.mdx
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->
 
-# Loading and Saving Custom Pipelines
+# Loading and Adding Custom Pipelines
 
 Diffusers allows you to conveniently load any custom pipeline from the Hugging Face Hub as well as any [official community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community) 
 via the [`DiffusionPipeline`] class.
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index a708d0cfb598..628e6320129b 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -303,8 +303,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                         </Tip>
 
                 For more information on how to load and create custom pipelines, please have a look at [Loading and
-                Creating Custom
-                Pipelines](https://huggingface.co/docs/diffusers/main/en/using-diffusers/custom_pipelines)
+                Adding Custom
+                Pipelines](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview)
 
             torch_dtype (`str` or `torch.dtype`, *optional*):
             force_download (`bool`, *optional*, defaults to `False`):

From b500df11559265857d6b51685affdc13822f625f Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 7 Nov 2022 17:15:41 +0100
Subject: [PATCH 141/269] [Docs] Add loading script (#1174)

* add loading script

* Apply suggestions from code review

* Apply suggestions from code review

Co-authored-by: Anton Lozhkov <anton@huggingface.co>
Co-authored-by: Suraj Patil <surajp815@gmail.com>

* correct

* Apply suggestions from code review

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* uP

Co-authored-by: Anton Lozhkov <anton@huggingface.co>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 docs/source/using-diffusers/loading.mdx | 368 +++++++++++++++++++++++-
 1 file changed, 364 insertions(+), 4 deletions(-)

diff --git a/docs/source/using-diffusers/loading.mdx b/docs/source/using-diffusers/loading.mdx
index 35f53b566442..35f1e0f928b6 100644
--- a/docs/source/using-diffusers/loading.mdx
+++ b/docs/source/using-diffusers/loading.mdx
@@ -12,7 +12,370 @@ specific language governing permissions and limitations under the License.
 
 # Loading
 
-The core functionality for saving and loading systems in `Diffusers` is the HuggingFace Hub.
+A core premise of the diffusers library is to make diffusion models **as accessible as possible**.
+Accessibility is therefore achieved by providing an API to load complete diffusion pipelines as well as individual components with a single line of code. 
+
+In the following we explain in-detail how to easily load:
+
+- *Complete Diffusion Pipelines* via the [`DiffusionPipeline.from_pretrained`]
+- *Diffusion Models* via [`ModelMixin.from_pretrained`]
+- *Schedulers* via [`ConfigMixin.from_config`]
+
+## Loading pipelines
+
+The [`DiffusionPipeline`] class is the easiest way to access any diffusion model that is [available on the Hub](https://huggingface.co/models?library=diffusers). Let's look at an example on how to download [CompVis' Latent Diffusion model](https://huggingface.co/CompVis/ldm-text2im-large-256).
+
+```python
+from diffusers import DiffusionPipeline
+
+repo_id = "CompVis/ldm-text2im-large-256"
+ldm = DiffusionPipeline.from_pretrained(repo_id)
+```
+
+Here [`DiffusionPipeline`] automatically detects the correct pipeline (*i.e.* [`LDMTextToImagePipeline`]), downloads and caches all required configuration and weight files (if not already done so), and finally returns a pipeline instance, called `ldm`.
+The pipeline instance can then be called using [`LDMTextToImagePipeline.__call__`] (i.e., `ldm("image of a astronaut riding a horse")`) for text-to-image generation.
+
+Instead of using the generic [`DiffusionPipeline`] class for loading, you can also load the appropriate pipeline class directly. The code snippet above yields the same instance as when doing:
+
+```python
+from diffusers import LDMTextToImagePipeline
+
+repo_id = "CompVis/ldm-text2im-large-256"
+ldm = LDMTextToImagePipeline.from_pretrained(repo_id)
+```
+
+Diffusion pipelines like `LDMTextToImagePipeline` often consist of multiple components. These components can be both parameterized models, such as `"unet"`, `"vqvae"` and "bert", tokenizers or schedulers. These components can interact in complex ways with each other when using the pipeline in inference, *e.g.* for [`LDMTextToImagePipeline`] or [`StableDiffusionPipeline`] the inference call is explained [here](https://huggingface.co/blog/stable_diffusion#how-does-stable-diffusion-work).
+The purpose of the [pipeline classes](./api/overview#diffusers-summary) is to wrap the complexity of these diffusion systems and give the user an easy-to-use API while staying flexible for customization, as will be shown later.
+
+### Loading pipelines that require access request
+
+Due to the capabilities of diffusion models to generate extremely realistic images, there is a certain danger that such models might be misused for unwanted applications, *e.g.* generating pornography or violent images.
+In order to minimize the possibility of such unsolicited use cases, some of the most powerful diffusion models require users to acknowledge a license before being able to use the model. If the user does not agree to the license, the pipeline cannot be downloaded.
+If you try to load [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) the same way as done previously:
+
+```python
+from diffusers import DiffusionPipeline
+
+repo_id = "runwayml/stable-diffusion-v1-5"
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id)
+```
+
+it will only work if you have both *click-accepted* the license on [the model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) and are logged into the Hugging Face Hub. Otherwise you will get an error message
+such as the following:
+
+```
+OSError: runwayml/stable-diffusion-v1-5 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
+If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login`
+```
+
+Therefore, we need to make sure to *click-accept* the license. You can do this by simply visiting 
+the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) and clicking on "Agree and access repository":
+
+![access_request](https://github.com/patrickvonplaten/scientific_images/blob/master/access_request.png)
+
+Second, you need to login with your access token:
+
+```
+huggingface-cli login
+```
+
+before trying to load the model. Or alternatively, you can pass [your access token](https://huggingface.co/docs/hub/security-tokens#user-access-tokens) directly via the flag `use_auth_token`. In this case you do **not** need 
+to run `huggingface-cli login` before:
+
+```python
+from diffusers import DiffusionPipeline
+
+repo_id = "runwayml/stable-diffusion-v1-5"
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, use_auth_token="<your-access-token>")
+```
+
+The final option to use pipelines that require access without having to rely on the Hugging Face Hub is to load the pipeline locally as explained in the next section.
+
+### Loading pipelines locally
+
+If you prefer to have complete control over the pipeline and its corresponding files or, as said before, if you want to use pipelines that require an access request without having to be connected to the Hugging Face Hub, 
+we recommend loading pipelines locally.
+
+To load a diffusion pipeline locally, you first need to manually download the whole folder structure on your local disk and then pass a local path to the [`DiffusionPipeline.from_pretrained`]. Let's again look at an example for 
+[CompVis' Latent Diffusion model](https://huggingface.co/CompVis/ldm-text2im-large-256).
+
+First, you should make use of [`git-lfs`](https://git-lfs.github.com/) to download the whole folder structure that has been uploaded to the [model repository](https://huggingface.co/CompVis/ldm-text2im-large-256/tree/main):
+
+```
+git lfs install
+git clone https://huggingface.co/runwayml/stable-diffusion-v1-5
+```
+
+The command above will create a local folder called `./stable-diffusion-v1-5` on your disk.
+Now, all you have to do is to simply pass the local folder path to `from_pretrained`:
+
+```python
+from diffusers import DiffusionPipeline
+
+repo_id = "./stable-diffusion-v1-5"
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id)
+```
+
+If `repo_id` is a local path, as it is the case here, [`DiffusionPipeline.from_pretrained`] will automatically detect it and therefore not try to download any files from the Hub.
+While we usually recommend to load weights directly from the Hub to be certain to stay up to date with the newest changes, loading pipelines locally should be preferred if one 
+wants to stay anonymous, self-contained applications, etc...
+
+### Loading customized pipelines
+
+Advanced users that want to load customized versions of diffusion pipelines can do so by swapping any of the default components, *e.g.* the scheduler, with other scheduler classes.
+A classical use case of this functionality is to swap the scheduler. [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) uses the [`PNDMScheduler`] by default which is generally not the most performant scheduler. Since the release
+of stable diffusion, multiple improved schedulers have been published. To use those, the user has to manually load their preferred scheduler and pass it into [`DiffusionPipeline.from_pretrained`].
+
+*E.g.* to use [`EulerDiscreteScheduler`] or [`DPMSolverMultistepScheduler`] to have a better quality vs. generation speed trade-off for inference, one could load them as follows:
+
+```python
+from diffusers import DiffusionPipeline, EulerDiscreteScheduler, DPMSolverMultistepScheduler
+
+repo_id = "runwayml/stable-diffusion-v1-5"
+
+scheduler = EulerDiscreteScheduler.from_config(repo_id, subfolder="scheduler")
+# or
+# scheduler = DPMSolverMultistepScheduler.from_config(repo_id, subfolder="scheduler")
+
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, scheduler=scheduler)
+```
+
+Three things are worth paying attention to here.
+- First, the scheduler is loaded with [`ConfigMixin.from_config`] since it only depends on a configuration file and not any parameterized weights
+- Second, the scheduler is loaded with a function argument, called `subfolder="scheduler"` as the configuration of stable diffusion's scheduling is defined in a [subfolder of the official pipeline repository](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/scheduler)
+- Third, the scheduler instance can simply be passed with the `scheduler` keyword argument to [`DiffusionPipeline.from_pretrained`]. This works because the [`StableDiffusionPipeline`] defines its scheduler with the `scheduler` attribute. It's not possible to use a different name, such as `sampler=scheduler` since `sampler` is not a defined keyword for [`StableDiffusionPipeline.__init__`]
+
+Not only the scheduler components can be customized for diffusion pipelines; in theory, all components of a pipeline can be customized. In practice, however, it often only makes sense to switch out a component that has **compatible** alternatives to what the pipeline expects.
+Many scheduler classes are compatible with each other as can be seen [here](https://github.com/huggingface/diffusers/blob/0dd8c6b4dbab4069de9ed1cafb53cbd495873879/src/diffusers/schedulers/scheduling_ddim.py#L112). This is not always the case for other components, such as the `"unet"`.
+
+One special case that can also be customized is the `"safety_checker"` of stable diffusion. If you believe the safety checker doesn't serve you any good, you can simply disable it by passing `None`:
+
+```python
+from diffusers import DiffusionPipeline, EulerDiscreteScheduler, DPMSolverMultistepScheduler
+
+stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, safety_checker=None)
+```
+
+Another common use case is to reuse the same components in multiple pipelines, *e.g.* the weights and configurations of [`"runwayml/stable-diffusion-v1-5"`](https://huggingface.co/runwayml/stable-diffusion-v1-5) can be used for both [`StableDiffusionPipeline`] and [`StableDiffusionImg2ImgPipeline`] and we might not want to 
+use the exact same weights into RAM twice. In this case, customizing all the input instances would help us 
+to only load the weights into RAM once:
+
+```python
+from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
+
+model_id = "runwayml/stable-diffusion-v1-5"
+stable_diffusion_txt2img = StableDiffusionPipeline.from_pretrained(model_id)
+
+components = stable_diffusion_txt2img.components
+
+# weights are not reloaded into RAM
+stable_diffusion_img2img = StableDiffusionImg2ImgPipeline(**components)
+```
+
+Note how the above code snippet makes use of [`DiffusionPipeline.components`].
+
+### How does loading work?
+
+As a class method, [`DiffusionPipeline.from_pretrained`] is responsible for two things:
+- Download the latest version of the folder structure required to run the `repo_id` with `diffusers` and cache them. If the latest folder structure is available in the local cache, [`DiffusionPipeline.from_pretrained`] will simply reuse the cache and **not** re-download the files.
+- Load the cached weights into the _correct_ pipeline class – one of the [officially supported pipeline classes](./api/overview#diffusers-summary) - and return an instance of the class. The _correct_ pipeline class is thereby retrieved from the `model_index.json` file.
+
+The underlying folder structure of diffusion pipelines correspond 1-to-1 to their corresponding class instances, *e.g.* [`LDMTextToImagePipeline`] for [`CompVis/ldm-text2im-large-256`](https://huggingface.co/CompVis/ldm-text2im-large-256)
+This can be understood better by looking at an example. Let's print out pipeline class instance `pipeline` we just defined:
+
+```python
+from diffusers import DiffusionPipeline
+
+repo_id = "CompVis/ldm-text2im-large-256"
+ldm = DiffusionPipeline.from_pretrained(repo_id)
+print(ldm)
+```
+
+*Output*:
+```
+LDMTextToImagePipeline {
+  "bert": [
+    "latent_diffusion",
+    "LDMBertModel"
+  ],
+  "scheduler": [
+    "diffusers",
+    "DDIMScheduler"
+  ],
+  "tokenizer": [
+    "transformers",
+    "BertTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vqvae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}
+```
+
+First, we see that the official pipeline is the [`LDMTextToImagePipeline`], and second we see that the `LDMTextToImagePipeline` consists of 5 components:
+- `"bert"` of class `LDMBertModel` as defined [in the pipeline](https://github.com/huggingface/diffusers/blob/cd502b25cf0debac6f98d27a6638ef95208d1ea2/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py#L664)
+- `"scheduler"` of class [`DDIMScheduler`]
+- `"tokenizer"` of class `BertTokenizer` as defined [in `transformers`](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer)
+- `"unet"` of class [`UNet2DConditionModel`]
+- `"vqvae"` of class [`AutoencoderKL`]
+
+Let's now compare the pipeline instance to the folder structure of the model repository `CompVis/ldm-text2im-large-256`. Looking at the folder structure of [`CompVis/ldm-text2im-large-256`](https://huggingface.co/CompVis/ldm-text2im-large-256/tree/main) on the Hub, we can see it matches 1-to-1 the printed out instance of `LDMTextToImagePipeline` above:
+
+```
+.
+├── bert
+│   ├── config.json
+│   └── pytorch_model.bin
+├── model_index.json
+├── scheduler
+│   └── scheduler_config.json
+├── tokenizer
+│   ├── special_tokens_map.json
+│   ├── tokenizer_config.json
+│   └── vocab.txt
+├── unet
+│   ├── config.json
+│   └── diffusion_pytorch_model.bin
+└── vqvae
+    ├── config.json
+    └── diffusion_pytorch_model.bin
+```
+
+As we can see each attribute of the instance of `LDMTextToImagePipeline` has its configuration and possibly weights defined in a subfolder that is called **exactly** like the class attribute (`"bert"`, `"scheduler"`, `"tokenizer"`, `"unet"`, `"vqvae"`). Importantly, every pipeline expects a `model_index.json` file that tells the `DiffusionPipeline` both:
+- which pipeline class should be loaded, and
+- what sub-classes from which library are stored in which subfolders
+
+In the case of `CompVis/ldm-text2im-large-256` the `model_index.json` is therefore defined as follows:
+
+```
+{
+  "_class_name": "LDMTextToImagePipeline",
+  "_diffusers_version": "0.0.4",
+  "bert": [
+    "latent_diffusion",
+    "LDMBertModel"
+  ],
+  "scheduler": [
+    "diffusers",
+    "DDIMScheduler"
+  ],
+  "tokenizer": [
+    "transformers",
+    "BertTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vqvae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}
+```
+
+- `_class_name` tells `DiffusionPipeline` which pipeline class should be loaded. 
+- `_diffusers_version` can be useful to know under which `diffusers` version this model was created.
+- Every component of the pipeline is then defined under the form:
+```
+"name" : [
+  "library",
+  "class"
+]
+```
+	- The `"name"` field corresponds both to the name of the subfolder in which the configuration and weights are stored as well as the attribute name of the pipeline class (as can be seen [here](https://huggingface.co/CompVis/ldm-text2im-large-256/tree/main/bert) and [here](https://github.com/huggingface/diffusers/blob/cd502b25cf0debac6f98d27a6638ef95208d1ea2/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py#L42)
+	- The `"library"` field corresponds to the name of the library, *e.g.* `diffusers` or `transformers` from which the `"class"` should be loaded
+	- The `"class"` field corresponds to the name of the class, *e.g.* [`BertTokenizer`](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer) or [`UNet2DConditionModel`]
+
+
+## Loading models
+
+Models as defined under [src/diffusers/models](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) can be loaded via the [`ModelMixin.from_pretrained`] function. The API is very similar the [`DiffusionPipeline.from_pretrained`] and works in the same way:
+- Download the latest version of the model weights and configuration with `diffusers` and cache them. If the latest files are available in the local cache, [`ModelMixin.from_pretrained`] will simply reuse the cache and **not** re-download the files.
+- Load the cached weights into the _defined_ model class - one of [the existing model classes](./api/models) - and return an instance of the class.
+
+In constrast to [`DiffusionPipeline.from_pretrained`], models rely on fewer files that usually don't require a folder structure, but just a `diffusion_pytorch_model.bin` and `config.json` file.
+
+Let's look at an example:
+
+```python
+from diffusers import UNet2DConditionModel
+
+repo_id = "CompVis/ldm-text2im-large-256"
+model = UNet2DConditionModel.from_pretrained(repo_id, subfolder="unet")
+```
+
+Note how we have to define the `subfolder="unet"` argument to tell [`ModelMixin.from_pretrained`] that the model weights are located in a [subfolder of the repository](https://huggingface.co/CompVis/ldm-text2im-large-256/tree/main/unet).
+
+As explained in [Loading customized pipelines]("./using-diffusers/loading#loading-customized-pipelines"), one can pass a loaded model to a diffusion pipeline, via [`DiffusionPipeline.from_pretrained`]:
+
+```python
+from diffusers import DiffusionPipeline
+
+repo_id = "CompVis/ldm-text2im-large-256"
+ldm = DiffusionPipeline.from_pretrained(repo_id, unet=model)
+```
+
+If the model files can be found directly at the root level, which is usually only the case for some very simple diffusion models, such as [`google/ddpm-cifar10-32`](https://huggingface.co/google/ddpm-cifar10-32), we don't 
+need to pass a `subfolder` argument:
+
+```python
+from diffusers import UNet2DModel
+
+repo_id = "google/ddpm-cifar10-32"
+model = UNet2DModel.from_pretrained(repo_id)
+```
+
+## Loading schedulers
+
+Schedulers cannot be loaded via a `from_pretrained` method, but instead rely on [`ConfigMixin.from_config`]. Schedulers are **not parameterized** or **trained**, but instead purely defined by a configuration file.
+Therefore the loading method was given a different name here.
+
+In constrast to pipelines or models, loading schedulers does not consume any significant amount of memory and the same configuration file can often be used for a variety of different schedulers.
+For example, all of:
+
+- [`DDPMScheduler`]
+- [`DDIMScheduler`]
+- [`PNDMScheduler`]
+- [`LMSDiscreteScheduler`]
+- [`EulerDiscreteScheduler`]
+- [`EulerAncestralDiscreteScheduler`]
+- [`DPMSolverMultistepScheduler`]
+
+are compatible with [`StableDiffusionPipeline`] and therefore the same scheduler configuration file can be loaded in any of those classes:
+
+```python
+from diffusers import StableDiffusionPipeline
+from diffusers import (
+    DDPMScheduler,
+    DDIMScheduler,
+    PNDMScheduler,
+    LMSDiscreteScheduler,
+    EulerDiscreteScheduler,
+    EulerAncestralDiscreteScheduler,
+    DPMSolverMultistepScheduler,
+)
+
+repo_id = "runwayml/stable-diffusion-v1-5"
+
+ddpm = DDPMScheduler.from_config(repo_id, subfolder="scheduler")
+ddim = DDIMScheduler.from_config(repo_id, subfolder="scheduler")
+pndm = PNDMScheduler.from_config(repo_id, subfolder="scheduler")
+lms = LMSDiscreteScheduler.from_config(repo_id, subfolder="scheduler")
+euler_anc = EulerAncestralDiscreteScheduler.from_config(repo_id, subfolder="scheduler")
+euler = EulerDiscreteScheduler.from_config(repo_id, subfolder="scheduler")
+dpm = DPMSolverMultistepScheduler.from_config(repo_id, subfolder="scheduler")
+
+# replace `dpm` with any of `ddpm`, `ddim`, `pndm`, `lms`, `euler`, `euler_anc`
+pipeline = StableDiffusionPipeline.from_pretrained(repo_id, scheduler=dpm)
+```
+
+## API
 
 [[autodoc]] modeling_utils.ModelMixin
     - from_pretrained
@@ -29,6 +392,3 @@ The core functionality for saving and loading systems in `Diffusers` is the Hugg
 [[autodoc]] pipeline_flax_utils.FlaxDiffusionPipeline
     - from_pretrained
     - save_pretrained
-
-
-Under further construction 🚧, open a [PR](https://github.com/huggingface/diffusers/compare) if you want to contribute!

From de7536281a029167c34d30c9b5d822b19caabb69 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 7 Nov 2022 17:25:13 +0100
Subject: [PATCH 142/269] fix image docs

---
 docs/source/imgs/access_request.png     | Bin 0 -> 104814 bytes
 docs/source/using-diffusers/loading.mdx |   6 +++++-
 2 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/imgs/access_request.png

diff --git a/docs/source/imgs/access_request.png b/docs/source/imgs/access_request.png
new file mode 100644
index 0000000000000000000000000000000000000000..33c6abc88dfb226e929b44c30c173c787b407045
GIT binary patch
literal 104814
zcmaI7Wk6fa_B~8n3Z)b%(Be|O6f5rVw0N-s#fn>ScTEkqBE<uxxCVEZP%JpXgL{HI
z2`{D3y}$ea@}3VT$()%zYu4=9GiTN$^u3B4-eZc#7#J9M3i8tG7#I)uF);4OKEk?9
z+4*vUfBS*y`c~n?qua~xk$K4NZ*n&o9XAa}fSad@iv@<IgQL9#hpU;3g@uEwm809<
zgI0;#OfUarl5(*yakF-Gc=5s7-U7qU-GTGPYrz)|jvkyZcwY1Hyx<lP;pGwG<9i0q
zyN7}C0z*Oi%?Gcv-FaXAcnkE+0nEwe>3AkzrY3PUkv1E<;s6h~eK&{T;ANn#UgnAY
z$fZyIpwn#4#O~r!W9UReip8*?q8{5pV}h2Rf?U4a1b$e-LuRbp-@h?nd(p|CO_EHo
zWd8A$q$Ox-T6cw?>7Th9oelV{OGI=p<@4!aJb0y9_@O{O=iQG#U%V@RkjPo&I8V;a
zsZD5V8@T`Xb~0&H`$6bcWH&?he=^^dV$s|ZQ(swk<mrB{t?&Hb+rcC`Sok(Q{lov6
z`11{woewkASivv)zd8%l*LpR!Qz-5L{EH-g&SH=QG(V^2gzh1G_P>UmyY+1K<!nCu
zFKCy<hO6Y|Ft*aV|J(w2dN`ir4`s_r|G8FIi2(O*aK8U-<G)qZ9IDIN{C%9A%fSD?
zG>8=_Dh76%r11miH2%lvAM>>>UbnzN{M!qsy8my>nyX>%%8?;|uC##;40e)z&!`mc
z{{?khzC;k|tQ?t9|G)4HDxN3O8Moih|9>b^(4B0(N)}yF`=36zt<VwW9?igy{JvdS
zRNC<0+hwl?K|9$KjDUClcEH)IqR93S+M<dYt#(_dtarqq`@@2T9H}uD^FI9BQEg<u
z`}&rDg>!byFn<xSr?(;|R7nkK$f;JQ<ZRe{TTxNT4XH{w{Yyc>Hbly4O|Ya{%w_eF
zTvU9mm#A2O)($dj&s!?FhHn4ZHxaG^@7mEh&j2`Vv}n%7d$CH<F3dFz&FZfa2+#Rm
zJz&3?)V<lRe-dyL&A|cZq_voa1nNuj1~+pYTpOasDK4{m`@+=oU*3CRekO_IZ^=$K
zOXGOqQT?UjcwXi@;PA)~Ac<ZhH~S6s`K;%Feqa=E5Xh!`Fi6&!Xyn(<A$aq7x0pSm
zz#-v8^RAOy=XeHqJIgV0e@yfvo>K{%_NeKGJ}jGMvzna@o0ws)TeJ82j+<AyVYSSM
zKAF=ASc@VBTa5m$Z;~Ry6@ad6d-P1UeEggX29<hW@lKsqFZ)c8kSw%>0BH+m<HRB2
zZPzg<dGh(sP!MIK_{ZD@Q-o*<ZI-O(Vg$6LNHVIHrh+x{yJJT1BT8nEM+x(`uPq7a
zR^zOnx|2-I%VJkIjSf1-RIfjfnAerpm6!8*LIy1%E?rEkDY5mZ)T=$A8K{HHjE-#=
zlQwe)w_bZutIp3fj`VkasB~I-OJTdz@o+9F<y}b5+LhFuFP>=B(f}d~E)FPGULEB7
zqU)4hA#Bu|R1z+jjm^JQ=9ZkC_ARkJuGD^I%@R>i%<LaJ2NX+Qu2B}Vl6BHHimv#=
zT2+v0Kk-UKqcc%;4+UFY)ip#Fc~8Js-jR(XgH8Tt2+1~5y_WUDRmPU7Yj@>216jG+
z9LqCKbYk@9;nRq;lb5wj?=1t&77yb&$J6z;kosf4R$!J*9#8phj%v3$EJ_1xkO!W>
zmUn#<Z<*=Qcj14A=scXK7+}&iD?s0G^X_8aW-sKb%h;w7yf8q=s3tn^#fgTtB5w|A
zMTc@+-3jZ3_sS5#kD*BN1sj-T{y916>&tgGr@<8&{lf3;o4NbJ0uy!%yAsQKY-os5
z1yU8FrTZI#&BAlB_C@4iFja6Y7A*O9T;A|3n~<CdTbFh;+7)Q5d+c#A7TW7wSA%y$
zKsR>q_M8XAl!g){j}R&u4vpuq9r%8a=+M2#7qA_pu}V)*k7sqvH5i(KO-JMO={i69
zCm9f%!p4|eV-9B0e&2Wl+(^Na0*T#@euE1&8%~25HW>rrH_I37*Bh3*LZeX<QV!<p
z#gqzm8`Gr?J7(n=n%KV$VtV6l8bg~NJa-fE#2iBl#1*X&RN9}oBs5*(?*(Z79{G)Y
zLM|CuUuz`q+izl+;lj^b<iJpQ1E2{IAcdg6-lTdk>s4+XwN>twoeXas(>mj0+ot~>
zeLGuw+J1BeQv7;<z=$j0oEn*NO-Ks8BF+-^X78GK-M<7X_U23l1_WRPpSW>0T@om@
z6tXQPF!-H7j?CBnWE5n`xHt4W&O>3``+P%dqUREV!NoUMNrS!w>L!sEzwxZI2|^!)
zAAP9JlhUXgMRV^3XbiMe$j36z_1Y&^c|wgEK$=5nuAA#7GL@a)HSy?iJ%aGHN3;My
zS=0Qp^h+rM*h+Wt#SxVN!3|qwP*u;tyr@|7wX(|@72VwASGC$)XM>Cyr%h_~VIDOj
zD)T`-7qBZ@Ttk<SJHa!NooVOjfsmDA^u4TlL_+wMOIJ}wVfcmJNjN*um*whJ$L8fu
zX7)mCN1(u`w}uz4%vYuIblXQ~)7Spok}}8U*9JG;KP>N4EfrM<T$YmeXm64dm_i%K
z6i-N#@KLt|*BRYfEST^+gWJ;R@meqr%3NeZ{ASVjVnWngSd?~l^9eehXydWR%Heow
zzS>$Vm_6aGVe*gO7PcVb^fQT6Puj(OYf|2eR?ej!7;Jx1P;#GGyguL^1;fzeU|AK%
z>E<BTvpv9EoJL&}hcPQ)5Jj**y+&}b_yYl20YW@DucfGF;d28EC~PM;#iex7;IYof
z391>(m%CA7VtXR*(g%%~e7oZh2plf>BGR++cwIL9(yikl8nve$?(*EWXp!JSZK0m@
z#+%5U`RoN&%gJXv$Z!^0b6S!(NTIeR@A}|DF=q*=Yus;XSpB~vglL%#>ga%14PU%`
zl`3iIk)1{x4(tdRH{?P~jYyzDSLAR6$CD<%Cr{tp+}kK)*uXnPV72&zV2`161wc0h
z-}=+~=t1Md*g+jR6#y~<KDw4ls?ds{n@nt2CKxiF=`*OhLD&y~TprrDz}K!Pi65-z
zyP}^5USB3)djX)4ZhigcGhdXcy8lsjZJPCtpL^)ZW8E%FB_{c!bxttZ_R=G-pBzi5
zFN&)r2yN!Q2nL#L2g>QTdU2|*!l1z8xV(DD&aLcu$>91Mu;`U5jp=gd&NSTY39$YQ
zfJK9fYSu(wqPpx`sYTEBHMWMTof8e<2n)G(8jokYrL3)7bEIAu4tOeZ^IL_Fbt@MY
z(DR&u!5JUha0#O5mv7RSr@>VFs9R5MNQ=iN+3$!IzTOh_-x@7NANIYXCy(_Y7~(YO
z8Uu!`j6DbM?@1xO9*Rm1d|lcSyWY1WL34)HqCLfpH1GWgEY|(rJ5%^Z?0l6zz4?Zs
zK?MI&Ct9A9YCJK^n^t&(7wgnt35llBUuYGlIuCgm-K-#UXr69R@qIvV`8TR3+)rmc
zrIkJ7N2<z3O{;jHh9{%?L3_AZb0aO#u<8>jPaY3)Sw|$;$B`D>sJSX6(3`sc!ZEt+
zu-BIiw!}apQ4s|CQMy$a){81j%AQ4<pgXYZPy=9{762~3*<!G#m(K~II=bOIX?k*J
zhU{sMqT5jv_%?`lR_wu@0N6@8o>R>a`>nf^Di+>rh1D-ovBx}g+QeY~_4Q`$W}hho
zHU5L{s~JiRC}VFl7!dzMb*{4xpSE&823S$qH_c~hL~Zh$x>Dy@QlozK06$u+f<~R0
z_oiG}3EBL$SkpV+nSBhKY!C9uANK~Gi(NaE(*3K)ArH7jQ6b5mg>P2pwERG+Ih&(~
zuJ&kmSuWXKlbsYFLEufhuUl{HjeYnVm5H6mQgbwavtW17HG4E&MLPAPyU-pdn^AX1
z)iMpXT9t4{`gZ}P+UzBd5Ii?5Pz6;mEknns-x1K{X0KYlDk@}UMI}cc)|7eg;8R8Q
zEha=i%>U^>`Y84te5qQse=6mO(|3rrz0D)oKq2%Y{>EX4q~{1=A7Q>Ao=^mCjC?pI
zJ8TV4ZThZ(xmMrQRQg?j6b@O|6?8eN7IfJ=#|2q}O+I(--4(Mw=0o$CJW}X%FrRVX
zD7%>_=q;-7GLJPxh+cDY70)~*RR#1i0FCFW&60igA{k|MKaJI2w)f~d!6q0ztmR;a
zmZPC>YaCdQby@?i4}iJ6$ig>_>E0p0l7K`MBIvd2!ZQd8S$pj~Ll!{24Th`Eon93j
zTTVL8b*W+#s5KrAxA^=1`ZYuJ{JEya%uFqA({*8nuHczRY4cWVoks^k9|*UF68t!+
zsg;u?ZR%|BZ&px7sBXN*O-uOw`^}Oay7<vAk*CCl1Bn+#YTF?{`@NgDh>*=qPS9pk
z%m+g@irvu<cMEL#t)u8Ykgh!pA=`rSF_to)y1yWHY245S-~I88Vc5l2rED;`$QK%W
zj&D83tA|HJ#j_IH<O_R5oP{$Tjc}2jD~aiW1&Kr_6Va(FbgCUPp~F70n-E_b<%6m1
zF$1-P*THHTw4=!1CCXch|MJKFg^y%(+_^`!M7`x{DA>_l8*PRwObJ}?aE3WAZ4o|Q
z2>o2pngvIn>DG2P&pG)JC@jsLlDr68u<TJkFr$;yWgo72z`cZh&f_SW^MlZB3zo`B
zI4Ib{&SdnoffaA;hJyWv(HmXS&3V$5&8@W62>N{F6C;whsWm$Iz~G!?MR>pD2dIh2
zQ?sL^>mN4-h@!C$x?jW(G~~t{Z?KDJBW$gL$9#V~LG7`>xo3Hv)kq(c5Bx4&G7kND
zHa(v|r7exq!{aZ7TIOtY@qQ<1m-&yms?W{iQ^ZxMh?t@F&bTkZdPP3R^;N$M#yd$A
zfv>MoM<4=q+*&*OHtbg%**NuP?7+`}gD$sYZi6CUpxWIV!S#o5QE<V%joybOmK_>k
zyU$*T7(AhmON*PIiCWHwU|zUW0Z*KsjU7CO;02^$8xG|To|<B*G=1THtkp>Rg%$l7
zC6}pDTWJ;DPG&W>!Y&fn5e+9bdx!tt@0p0Zc(_VwogCZiE6~xN0q@s0j&(Gpg^Mqp
zsa(jSrAO;BMyDlUeg;tSJV1RlLQ-j=()D5|R{zu!+0$l3eZMWBD~h``ONr3gO?aN)
z$emjfi7n!}p&e5_q;W;Nx0ca{*bx!Y+?dsku=Sm=m$R^Vj<jv`<AjqbS^3IB{o)mH
z(w#NNGLRduX@)es7hg#J@bBg&2bu^&%T>Dx>9IZ-i###i_7D0|WN5NYfi2ESJ7~W`
zB+Ss|dVzoHR~BZuFm=zs*Tql}d+Eb_wWWF#XvY#mF=VIR<|`}fdS1Rzz{J7?0ZmC!
zEp-QUsRF&<6)gICqiUlwI(!Rw@=)4j%6aj{bE6HXJ@Fk|kr`*qOHIwX1X2YzbHrf>
z5zBt9o{LWXC82{iXH>tOQ4fby1JU}lszpw`(2T2LUfB$aW!J4?s&88NqZXK!;}Z@+
z@GEWLEY<;>7Z2RDz1x_c7!6o<5A6!#AKH;RZ;j>lmo>&gQDG>i4!(bwtIVFT-<jm^
zoOG(G0~IBHTf9$g%yl4+o|PdQbO?CG@6-^zA-TI-6aeb5*&y&d)ogFCm_en?pxEfe
zo>0-8y#CquUd`e)-5VuZBzN=K`RQd)pSBo16;<hhnyAt|+)9lX<SO3iwDw&m7g&~D
zq3Bb|x`%y<cfoIOKy5U3gS=>-*5<v+H0Lpbnq8+W+g;{9eYK+sP(oZ*2mMdf9pvXH
zfu%+leQdQ`_p(>Ce&2T5NS{zPM(%R*;(TwzD|7x4PcR5y(Z;gEO(agm-na+3cb*(j
zRogbQ*J1{IetfAZ$Bt0VGa%c&$qYg1Y<|cys6O%}=ZRzTJaMIN|16M+uyU*}Yt1!C
zXh7_^F@rjccWU9~83FwL)jFTJGFDo8T!k4>x>_o`Dyaxn-b)l1G-P>T^{IO~O`=NN
za_W;K?zEA~=hs8%s=9UD(wk_(03Ri6pOa09b<OEBl-JMP418)4CC%!q)6!~Q_kASk
z(5QOlbN{X#iT9$QKqozHmGJHUZ0xbR;U02sHA}qp1naN$Ic~W9wBBKXBLw^Vj-r#C
z4)8|y01rV0R{(bd@B1G^=|G3lF?@YEkyqqfc7&)?N^=rzKgV#?<$=gx)M8UVvB^P?
zA1XC!6+NEnjH1U7LLB2eLEqD~Y^eWg(q4Rzcx#hL`X=#StJ%yaxZ0`x?euz0P|+~B
z1Q`1K+=7Am+VXT4+%?KScj2^lFF@>BuXWAK#SIaZ&g@Y2$tLn^z4x}UgU8AIP2D9@
zIvV`lfA4Mg8+OF$m$TP}>>b}gxX#!(+*jOk4V~>M=udu#T<h1hful`J-*CJ=(ik3*
z(g3lUX>H##?YG)6<p)&Imawg-f!&9djd|Pbv_lQuMKo4ELq7Qr^Rs`=n2CO9q})6Z
zd2cD|Jz!KRg!@!&EG8W+1>&Mq+7pyu_Ecr^hM%qWq17a=;AC(v$gGwUVCwNFol#e;
zH=mmq_!d_3a^OaPw~ZuEmg%bDKPNc2v3@SB43c_Qk2W3?IbDoPsHT3-s%v0yqYwzo
zmLGaZq2xx?dSl=j)pA3SD_Ii;Gctfy^-NZ_9ytX_(vuMT)3uUQ4UXtrUa#%SC|&tF
z+`1~WNGSY9APPm90cbDEI7CF<uW6}FuWc2qYJW;Nja7($kP+RwY&mpVzs}%ExOLY0
z+ZNmkB_?<E_RhQK`%tdv)p@xwZ#|`<<x&%DT1^IYQe@rA{SUiT&`Hi<=E?Pms_M~U
zA!)KN+~v@ww7|Lr0ju;SRjI7bLkczpRW^VWhR!*UcNYTF4-B?OnsLi5(@4B7+$YCG
zk>vq&pRzi<5prNQvT0}Bexvgc@inacpPQ-FyG1-g9?e5+l%6hiqyIEQ(?%qy<bJ>o
z`|;Jas8ukZCxxNx49_HtXiktouty_pxN0^q7_4*TLX(5vxwLoWbpe_eYLmOd%f0H>
zCn*oCE;@XP@Z{d8c7_1?&mTi)e@LxE=HoTVmAc<}1n6z*xBAS&-ygrh->IR&Gz;0D
ziphVJCKn}k#d;0PW~|+)Of*`meDHj6YRGc>X0>vR)509yAesV3?3^S04X5kAR=j#W
z0@|9BNDC`mCpU8M@%K;Y1OMb9aea;T>$Eeu)UIY-J`#2)wOlX3ACoa@Ua@a9JALFz
zZXt<SgtL*n_8p{AN|G}|tx{q0`%i#tgma-nPSLnc+lF%IevsyMNsa_)W@9hPToS+M
zO-(EM@lJr#Iql8D5zIlp?de!1hlQ_4C3sXQ+JXE50_$~!BcWsSzQpaEIuJOCb=gY8
zhVy_p^r78X%hwXj!NsaOsuPzB|8c*GWfOFZ@X<-6p(|tv=}mGKAokDJQ1G_W-eid_
z73s0K;Qrw}Tv(5Nc%44C`FnR)&AhdSr8fzCf(PTgc=UirxK7=xxa8oO5!bdz_LeIF
zS+^>TJXd#!)NX-(8!j8^zS9OH1l3xkaDBB?eKe#m4H%$LKi{~|%oY7p5gedD;f*;Q
z#anvL0V|#wuw~Oa%18yoKHGM16pck1bbU#>U`KC^&DFM08(cESXf1uhbu?c;G2%Xp
ze5iC&OuNFI!?V1jblMPwKW^W81T^T8{AazTUhH~^^kRq9l4mt&rw#xgTm8uHo@={+
z(7Repyt#PnIHG31MGmiSzu++3*+N|-|A850D7vlRzKrBZZlRXRZa0$l=hz04E52le
zWFHA9Ivv@I3v<wiWu;aoAB^6^4-OisSAD#t?Oxb;O~XJYxvGHXcBRloG{m}17zd?k
zN<SotKG>WF)9}h2uP5BJL=9LPuGWP1his?(-orZ0ISMYV^SlXkH~THKq~u+l#AbQQ
zJcl~p`kH*1_bvf^j$La>n@wN!pqYzY*~gCxRAB$LaengU6#6i$_<HG4)IR<(jl^f?
zW_BHX?^6xTjfK~Ny_sAU*F$4o+4Ql{W^CRQ<jC=(qDFf^ngX2Y<;``{WaNlv9C&-Y
z?bNp6z(S=JgI>g#s@X<G@(8fx8c`xjRb?{WkiKmwXtG?o-{?2XWGzOOE_J%YmXvIu
zB;zILqwn>w_PUgf1h?9szrP6lRy<?O_U6!UzMz0sN~}<_0OmiC#Wi4ht>~;Tbjhra
z?boeG0={V8;rD&wzAPRT&Ab&(n(Q(w(h!x_aqTSr=@BQ@WTGViI8d)~RbSd(2)}^{
z)wVh}X9{)4c<rbITCKj0`jNo7)BV6>L+m8D3vxCeFHtggd-w+$Tn@q?m-4-XD=^X>
zwM#Ms_TQ7sNRpUwN%oQ6T%?Sd&bS7^UAJ5S8(Je0xBL3d-aB-n=`3~{6uCo>tYBDg
z(|QyOBED6)oG-Dl=fF)&p;@n%I17Cj#tUcQuXf-iOHgk~*P|~u#{_5M{_dL3NsYwX
zHXJATvW;2;{EiP;z6A!=q@QTs?S7-;i-LuDslw&jstSFAPu1M4Xq69CbEK<qSeDYx
zwI==rkPD;Dk3g#Yu4yH%IH^ua3YEYmc9mG)73D71Ha3M~KS|J<L~Ci+l9oHHVM{-0
zjKgKv7j0%@{MLwU2oEJ@871?aKY@SyCbZW!@@i81-!0V&<dJ-&XThDca9%*l<t-Y=
z%C);15f0%&)F@tP=HFZ>yYol&y)ztu0!DuIOjlTh!wRfuhAhg5YaNYpmFnZh4_0t+
zaT&Tyt=)IPe+{ud13P{2k>I4KSC8g~;&K-i7UmZg78YDBw?r}X@x>1h52rMQg!~RA
zTHjMvke2S!v$A?gM@JVxOd+@@ExX50Pwzq>sEe^t@<1QkE#hC5;Kaurcf25V+a_ZS
zre8GOHv_)Y2rdisz^?yl6Tf?N7v_g-cxe8s?EGzK+|N>mTcfv^i#qN-{S8T}c_V@|
z_WwuJxz%2*Z?%`dqKWysi24fG1NDF9lsi49Ps?yy{y!SXTQRm$>AZEr`^bj>$TEKg
z8n2?ro&V-?JLCPYX!OVRu2-Y}@8R3~s&|m+M|}c8zXOc|CQevf4o06W_=p$fq`vs`
zZ1hDqw`^WU=&kJXn|04gS9nA%hGYJT@IAN7<iBFfc>MU?m){D?3jbpJ|4HH-g&O#N
zY`|H5^Eb0RL(>Tqdmw;4E-*Z?+D)TN=HK-19>|$ay*)eAku2c77guYr`$JVgab5_#
z{hR0>)8L6G%yo80Rl7R%`PiPtmLNuVlh2JC&6LyAPhkeM2M{2>z<e0yez#;U?@IM@
z5X(%u%*;S_+NPOSdll0zX6>6h;VQn&6-JY8h;GH$odtdhXq6B=9~+(_Fo$$VIt5sj
z4Nb-n)}mv=RrWCc=>-ZLt}h7%T<zdTI-YZVilrdDy<#x_z})TOv5gv5s^0*ojTA5t
zNIIPzaMW!wr28yuO0eRwTt*1}LJU9n#NfBz0l#qT@<Wz!ZG7f$obKVBJFyUyRZi{-
z(K)*xRThB!>NnFYIU*Cd%lR^UFbBr_YMFQPCNIe!yQALp&|q$umx?N|80a7j4)Ff|
zcxKe~_sJZ0jUYG1>3mbUxiHv4^sV%jkVvr53+TzSsJD|`{icX0hW8Ia;HG49hDO3A
zM-aXppgUH3`orqaFZlY-{j|yZI}CO{+~E@!-SNUy!-!XaIsW+cB{>~QPtIB3d`9QR
zES<C1S!5ef@A|zu!<|ojs;FRK*Kdl@^;BhkRmdUgiJ-gP*GKz#`t0LgTu7}e&#x<C
zzV+oXIb|#jd9|9;3N{~FBo`BY6%{fXoh0b$*{o~EnbBqN@a!<#oe0>PsfV)ArZ!jY
z_8v&}5m$qQ*Ghf04!0tb#hmQ1PfI}kAV<6Pna0cVqrlc9_34uVy{Qbc9tXqAS@=aG
zB7lZ$G6BIQ(XJR}n|yZ8<W|bL9w|q&N{vBfHxv-0d;nbCtM&X=_|~uiC~Qk?=~4nd
zDZWMJ-9Ihig$2CPpdz%uX2r3`%^c?6@<Cr-3H^GxK5<36nxl&CYDDy2bAZLjbM>Hg
zfDsFOHUm|eQFEU6SNZIUz9r%sQQobj-1>uNzHu_`%jy0DPvYkj0_P7eL{N4&0Evro
zK{MGd;W&_ovzr6%^GF8^Ap>F`qK8hhaw5SH=LKQt*tm%2_M;1I)}om~?2Fe{?|`On
zKMy-x3kXLgN}yva*r2Olxt2y}xiVc}KAnjv1$BVJOK-DgJg@AjNsEXE5m39gT?AkA
zSpaEL19k`TknUmIi(*95dJXAn1G(<t!?mclr&LZ(ZZ~g!ay(V(SbG2((?p!Y6R=(u
z{BdH!12bi)-nGN{4}P4%j_!VnjNx;Qhu_{xYb-SM`|7?i>U^*ikZ0fj6Ngt`bauyt
z_Dsa7$kS%#^O={`>6KecfSBMMO%KxItYSH7W;qoZ!2H)s@@wo_*+e^TM@Mo>fbAF~
z9|p)L)_5LzNnqe9bY&tcdQEjNv?9p$d^pBTC~SpjCNiAJWUTe@WkH8G-XP>HwDDZ*
z*V{(t09EYNg?Sb7f2Q$U<`DVGd!y1`mM62j&f*_j+T2NN-v8_?k^x;hk8I6)yZdKW
zmYkZNQek-5t%r|c^Dq}saz7s=rRDg~S%5&hn}wxXP{8ac26ch;eMnM*!<Rn+3^TJl
zV^+jnERr2&pUMl!CI)IHLV%~d5O3u{(5y|g@iA4mnFh9N=PIEXIvQKSH|GJjfv>1)
zHBat>s~dHi5=<_ADayCuV0LzIc_uy5_Wn0l^xocbkVSvJ<Mz8(R*Eo17H8P&4Qm@x
z6dF+aTOSIXCx=?+d9F0^ii{7yuFgEUm+IHVj^Q`=ZH3gDWu4Bb6G!SbxQP0+4%LyG
z1M)6RKAH1LPOuZP>Qx-%IjOXcs1w>f?i_5yay0ASpX@vo+?75OxnSvbJgahJe=uZv
z9C}?>fGcww>^`<uFnIi$`S>**ol4y}b$U|!Mmg0<g$s(DMv>{Qr*@lKkFJ6GRHhmD
z-H9WK#b59({t1yP_>mWLfXov93c`N-nW1kiXy2n57NKe4{J2wjW&281B;MUTF_^V0
zFRW(&Raw2O_aeEmnnk=p?b9DxuT9g^j5ZWsl%S7!Z{*BBde*%B;~gc6^74-+a#P-i
zx4;O}k9a*%=4I~G?bhyk<8{bW<A}zi6!f+9ALqPS!@<SNDXgf-Np%i$3Xt^#0t6+H
zQ<G^QyJILUvFnHv<6eIj#z|$rwR)&XOhWU{yZ-C<?N=RdDKB=SIF_<SG@(}^QPz`L
zLL5F>pBj3Vcs2eYpg#=P(ADi8$-<w}=~RvC$}OmxV?hHH96mA@SSOgvDun-mfBs%q
zS4KwWp-E=ux1eA`Q=H5n`V^xsDNCm#EIjqrliBOvn0Q-{-`q7S(ljEoq9UxX?^R(@
zkzRwaOIiw3fiPWf1QGe4mc*=&tddKO(bNlPDk#S>@p61`qtdH=IY#v7*~-NEP$q|q
z?HY40eF}5EFx_nyMqU=<#&IJ}Cb~a(T?&h?$j`qq{?h|eqcU<CTdj4&XU;4rEbIuV
z`jREb=Cmm$dsmHy$}P7`CYe>%#Gk8`F*ETN?4e({d(x-)reji!W0SXSIx{PHc2BdU
zKbG9)QIYl{qoXT93jXuQPo!6cK4@!eEak6ms$s8rh-%qV&dA6R5)r+0w)S%wV^mOn
zjWam%7#kZXp_I-4#bd-?Z@o$KPrZcJ9uP8oYKgb?&0p{lS=PjaG*(fvYq&k2aZKK`
z+ai)bs8;#(e2XVTR=h>-N(d`VBc5M>xa7_2c6&r$PPYjs5LrKJiQs<2#5XY!^i`5p
z>+Ri?bBJqME$ft0BL1gZclSl}j-*B8J3l1_WizCMM}BdHyzMulyBV3V-6rzIC`J2i
zN4(f5!&I`P0F4VJlIp+J(9{f^4~V}7OnJrkhfjK!FdNc&l19`^O-;==P!sEY_HtP!
z{>X+)$o;j`$Mhvy^UwbI!-%wv4m-*1&pyQS$jdEx;D!9~ZN0=4pTfc(&s5>~-TVH>
zQ$nsQ{*Wmn6Ikhz7OMPSVX*u*;ziwxR2`|;XX1akWh5n}3CbE>Ij&rhx4X~G|AvJx
z;S#T)v~-fdHtdfPdGXs-=xqnU{D<XKmZB}wM~xvP`QQE?w8>`W<0(H#9e<g5$6gG{
zSV$MdQk{5!U=Gvg*U!W|H=%FU!CS|B!z>UGHIB*c+4fB~*y*knjNdUNOUb1_s&N(I
zY-1>X#Zmq~^k>k})8EIubRPU?O9qs^eojmQa=DA&^gZi3(Np7${BM{TMgH~zJ<C8j
zbvX`>kjzuJe<7v3Gdl`@x}o$&X^Um!z*7T*y7Di!fBZJ^@aDwYdY;g|v%0H4n~B=d
zbmgo^@PXLZfBVXsrzdU-Q^khjqROEG`nQqyanb}aCG19>)K|`4DtE^1^a$Q@$0$@S
zXLfDzG~kE^_5>Z5f~6{YBw7C-LutlDls9r(6z-R|>deT02!d3cpBv5BwRS-~)mH?$
ztc5ZQj;Q+V|I+a^x~ZKvV?S}~oai_v<sZ`hq&AMHci;=aDyCy0O`SOb>MIO%IBqk#
zATKsWEBsl!ezZZaNh1>ScZRdkwYeGFKjA-7l@eXo6REA)u>zD98_&`Ef_hq53)d4Y
zqttai4z6yno#*B^s9ZPnAe6k(R;~gXcL?r3cJo&@aF0uorOGLw1WN*4Z&U{98Z*nr
z%MP>NwIvTL>8iDJBnHPhD`Ucph#=3Ry8vr)Kb}$ZZJyt)wB`GSYy5s8M^c_f_n@GQ
zdxXF|P<59*h}Y4kvodbxkCg;eV=OD%t2$R{A=ZuH``=vxUV0inhsJNP>74GpOK8^X
zz0LUDTO$=<Z&_4pfsbO~kdErjt;(4_RD9nQuNFf>&y}kBMzK~VrDwqol0hvs#~)5v
zKZsbB<c)cAz%4Bc8D7bav!vxP5KNW;EtByHw&tYLw=_wllZ6LLoULAhR?Vk_WQHQ>
zM~QlW2EIWW_~4OPGqDE{rMBf|*7JK^PM2RdSSq4b2B*vE%p;<VryHY0a3WJh%XLpn
zzt#&%J9*eWwS*%K`tr$e6DlpsMpkwOu4d!52p#B<xf3jd*x}3v>(a{|4CFU9ih|k6
z*Q1kXq&c6udt;>WlR9gd^=;7p{$<iprq(eTIIsFyIXH6MfUKu|vg}?kb%2uII|mEk
zC^GfUWwpXo%CC0ifsIRA&dltp_)!$o6bkD7#o7q#3t0xbD#-pNeV!1o*49A?>@-A7
zVzq>(IRLtTN@#3ZZw42hU$s&VG)sIj1WDa;dznEdn@-v{x5JQBnfs-&^zlZ8tCT-`
zuT<i}Knh<*T2Dg7`5HN+YRJQ|-yh>;-ssR*?vanZlGLH9d|f8+lyhEB-r7te1hg7p
zy>k_=t}ES6BdXl^V3u`G?@`xPMz$}7?O8+*4WOH1@p)a;m-~-*pNhJOBm-517lP7X
zw7}p3oZ%fY9w82!6Kh?gEl=J=s!D$B6nG-L;{Xc51Qc8uX-Anww40WDNqlfjf4X-u
z?`QGuvjhI{9Bjm^@8VaOux%R7pv6Xzy4!Hsu0ik-09M*+u_y9U#Hwyr-_vyE#YT7`
zhGI>u+lHIENJ8vUm->cagI+>~mjP?}3Ls{I^k@7W3kz?d5fwMWbd%edz0z<_>&jls
zf|D0oW4y%|Lv@yFV_kFf-7|L2G1N8hq;1rJJsds+kbOtHJR?IyEEz!#o<#YSYz_)X
zv=(D?Bj`hgeV?w7u1^ZPnBREtS3-{S<dhg=K$ov~qZNB4ce&p<a~?TTNJl5hn=WH>
zA6(p*gwxFGf+!RQN`$A%Z%>qc=_cx(A~mq}_M)6Vi^u`O$mq?MH#2yWb22VPgEaeU
zv^QRJ=SYf+$Pi6)MxkLtCRrPW&+W~j%3n7}SVx6J=7BL;Iv*pB+(P$-m|Ii&m-Vm-
zVWp>TU;o*mDpO7I^S>Pz!Z9Y6Jx|HW74%!xlnmn)V=N`*W_p1Do=zfDeN_2Wmh$JT
zesRMJ$svG?3xO&9)$1LNji!246*o>no}jvWX33S%9_^||65e7^;BWIGjbFQ~a}>c$
zE>DkI-R%=<H+Z@0RS{mPsZyVNe2QCGi4?tP0e+hS*`F>FWhNbJv1YgDbE($nL?R@d
zOWkRB!zW5TraaePvrGs}(68K*RdeCq6K(6Kq#x72EQz0`%Cb>Ib5a`#MVa+eT{cek
zqqo>Eb=Y+-2kBdh-boGuCY3uk$0poUv`4*2fI1X)_D1&RE`E$zTJc}RgF~H3*kd|%
zn)+><!xDL3F~=rlBF7ABxLH_Zf3FJL`MHyrVRs4lD2z%cICu_LI?y|_9<k=Tr5l>z
z><y)HAQ7`|w$Z-R4C`@Mm+Z1kT~EmcW2hQ?TwBD#)lkc6J0hl7!oK0Z4TRF)9K=XN
zj*X|PsB9W*K9}oEw_UbMq$hdAxT`uot|F*c!QFNpvT>oI+!I5TF0>GKwLCXA!WT-|
zezaU2u6~#SYM;7?gsqt<>3_It&_70-GUpw$a6(S}JUJ~=ZT%_y#A_WHd}m(5lEO1p
z52e?^Y)%e4L8cCP_OzsI56$5#AEP$VU%>WcN^mLA!F+zX?0azK=};n%_qQW{R);pP
z^|i!txLxp(u893Jl);s4hAp7%Qjgk^$hzb{b;*JIc^BD(i3!Mu)Jve|#K}#e0Z>zO
zBDb(l=tgY$ho<{Y3)h@uiqt~sE7u8zwe_8jLXl{L%f%RxvD-UgTy2p@3K$o3<SntT
zS3m9M`Gt<XT?q*8vP&XzEMrcdPBkc(WT9Y7n)+M-xEyff%>@rWC9zdcpV~^BFYi^_
z3(Pc2(cN+4s9@~R*5RJ~Tsf#kJ5*<F({%4i4JI#OT`cb6eW5}GWSD8NSbX)Pv`T|Q
znW-suUUG!oMsfsBD1-#S%i8}KM7-O}Q(*9V(vX!ImFW#wm^V{;%g2mPsLt&oGo#WQ
zY--nfphYYxHmH;lYZKe3hi6<20|@6Gl)0ja;<$MxKTR;s$5<zbHFw#aL#or31f-G%
z8KY7icye=7vUQ%+F!El-9bU3|T76Akm<22c=cXJdlLId6FfDK9>k4xXDzQJ})iB1P
zk3UwPUTiT7lSqM|#Ywn^_KKQEcymllwRL%fmgwGS&VJF^_~(GQyk$>W@W3Q#Dsv04
zKzyVG;pk<yew)C`2T4LHDuq3lN}AAVWJw-Zux8GwbxmdDm?GB)*7qpt6o(>+Y>6BP
zULVm6@_4V6mXiZ`Ly36?njTgv&n)Ulxy2^ePB+FdG4Fg#ZBH>@_h0eJ_bXA@`BIJ#
zvh?tY5VSorkMr;hQlKSwEH++dDds69wA4~A6FG~UOEGkd;<gRbt!T}x@!_{rO;A8Y
z(Xmf5Dm$GVwDDCI4wGqow}7~{KLr`9=b9&N)iS2)ut8=#8>SbLPhsaT1^`4X0|Lp>
zmPy>rufH)RMd+I;DWvP3c%>**nQlrz1_De&bt`y=;**FfZQd4_pQs2FXz(aO;-3*a
zb`P`7+r1<huX$U!burC96nD8vRB1l_vGU|Qtf9W9-o32SDAGCd>~%!7Z&)(gDmU3v
zF19?-#14ttPpxKleOGZ*-hRvF7xqeP;8U<T(!w)>znq#_sc(E5uu!3#n+<hkPCX7@
zPmZt!frYhF2fOf9cq>T<1Pc61g35H_5;SPKbnJ{#2D1!HNM-06W+j35E@k=Gv4P#5
zjpVIPvmGQQ9zu`i8rk0-!K8ZctlOn>Ye)g|7&0T;oedYC&o%;C#nSfiuF+gQr6wLE
zgB>v$Hg@g0VFrl9FW<+y8nsxY_xkzODA1-`LBz`Wjf=Gw5nm^4Ot~n`MwuhIL#f42
zNu&J>0hCzNXX`pWA77hg8;2@{y6=T=0WPzj#XVJ?=U$$&;R&8i+so0gO!X#E7G#|-
zx$A%rNL%2unDtZtCA5#QG>EDT<oZDqwoWgB-v2RtyOj02eRC;!RdCTTe7ocnAJmM-
ziW?_0dt5n?6h$_>&&@m?6k2leJ<U2ci~Bg;L4c}Il%u>9b2TZBB9**y00sSMnxJpb
z|BWeS`T}y%enVfBx#w27^_?4H62UseOHOKB+{vq)$$BQ=(g$*utDkm~Yiduh(W2(b
z^>Z%_#p|3ZHB`Tnt@5r>cya`~tvRiiS}U%h;j}#VM=7S+TSV~gVVWE9Ub7v+TCkWz
zcY@>Vt$Fdw9uzY#XMbJZjFVh3olius%USsj-lWf)AXgL!_i-8Y^VO<!v7`4XnOV0@
zEX|C~c8(a5z@d3`;3EvF-ZE0pz7ttwlj~U4s$@?@CE3d$>>4>ZrKgol6zp8;_$F7O
z#*~P5pr}>ZP?5LXL@Vz6yQ_2pSoL;XYQ<ekXL3i0@m}2*i}f~(Fw098ta2bCc2i%O
zpKRc8c3{cyUyqs=lc-BfZAuP>Zqd3UluHi?C9UlTuSIE<d9?I*%#;ukUNJBUul%kR
z;r_uwN_(1}@%OV$lQ3ADAOxts0($S!HOw?`=hO<QgUIYsQ>L958~WNvyq_6JvPj1}
zJV={ePiAC;!k)pxey}TYYrPHymQr)!MaxL;?45iT%J=`>?07twZyntWav%~rjl??R
z;5s>5d(jh{hy{Z;D>vk&Xf8Mck+9t|OkRJp`n@DL%+*XlDa?IP4+yq?sioaH%QE@H
znA}Wh;VU#Cf7+<2#$?|gXlKTe#J%jt;at;RvL9-0*{H9e+}roT(&gMObB7<E6uZj}
zAtA9H6{KL|r!V;g&d#kUoUt?=d#}E7^v-*%t-!Lyv;`mQBBOMRDgC*+t0nIlN9s@%
z9$iCZPA*zmjeo?RonH{!34VA1C0t@mf8K@nz*HldpY`*opmt?BPbm3WwvcM~&=AYs
z>n_6tC6_ASnP=))I+;Sz%5WNZLw8q$>Uk}pW#xU3cf0>=F$>t2cP3msv9W9?a%d--
z&JfP{+MFF9+xPZRk`>dJv+`0#WMwJliEjnQuFSK9jv9{HmxWcMz}1(g;u%%k8y62|
zC3^Fov3(6<f$66kQ>Q4Y^Tn@kR8I74*=bz~RBWgK>|B68U6C&7_nqs$jdp%2kqm7v
zo;nhYt?b8_q3rGD2#j*>@&Uc3mh^k3fDB+67Qsfw-NyLIn5}Grs#sqm;4*fE1%#K9
zbzkJDog($0w{ozXmE{mT*VAC%3aqQ5pvYBQ4I3^(=g7fdmOVtoND797xM{24QPhd*
zNLiGUnID(v1zY8B@V9Gs;b%H=3e6%U?~bObs5gHoy>o;;F*(S7qT2OQ_n&|OGqA|1
zjV*O4!d%&Ww+<8Rx{9k`ju|W00LCqgO^Q|WO9EZ1)=UVmL2SYck|lG^o?xs6_ftBg
zOsJqMU6Z^9UtGFHn(T(>yw{btR`^*%4JKRGH~UmZW+%|*a0XQKFf;V+z>0CPFOClM
z60?+}<o-0F)tE6YGE;UxC)iexa8Acg$Ly4@h$GjdwYJ;jXiD(QbnUh6oX%sL+u+(I
z;3mdO-3)MR1)RMz*|pGubGL#wUT@ZrGkVTJt_cs0>j5sHv3`m$epw~3L|3EX%}QgJ
zN%swU^t{D!P6t;^HthGT`XdvZhz60&Q|pfHQ$&sGA!`Xc(?*RD%(f*kX?!AHD?weo
zSYufFWB8@_*FD=~<p-W`kB+QcGQ&}*`Htzvg(kr#^1_nw5hF_j`nv7DM#A8@UNVG|
zTdo)|o^?q3JGTgx)53e>^}feEr3Jv?svySPD3+1Oli3Fmd(_6JMxpHohS}S;$^K)y
zCkiy)=|Hwog-5Q<I(l)xb@rYQ8FZX;e(K~`V2De)^lLe}QjIzV@CLNK7)@W0y{^+W
zZ>~*<2rsVg$5AwgWUYk4tK-WLVsw^+%Fms6Dia5p>)UIvK5o3YybW()j%4pWdj>9F
zlHSQ%;Z`65-MR67?+i7*^FZ|{`tgf^4k7{X$e8N0CxwzGIz-ljOeXWOorOcLx>taE
zlbL2@&y`yFwjNHk7i*hqNSKuL*|V!Cm=A+=<Byfi7W%!58#<4EZO1K2ea~>7VifmC
zGb`6(IHLnnG&^35hNR=vp`n<INpS|Zy=2^MB|=$aZ!saY5;|KGPtNL3#y99as3=uV
zoU3rzLiZ*r=93u&e@b4rtiYjl-O=<^eqQUJ?2~-p2JgxS$q;&6(I#d2rLtoE?%aDG
zS-ID)77vAuJ~&U;XEwQ5Qp%>&LfAZOmTuojdoMbYhcVqZcpXi&Hd22yVs0kN-&no$
zaWq44WQ4o^zUacIzT(X%{q(%(c^RqtW`j-xpClOia556GqiX87I4f2Y$n@Py%@kqe
zNcmLN)RbzfHs+?J#Nqmjm)$>pZ^Z<5SDTv))(3|SKaMWi14^_uQf3KYEe6a_<U4{u
z%8rvQp)>G)y4q)Up`phIK4Yw2?yN<gZ~DF#C3*Ic#rpL8le0mob59pnTaCxzdEgc9
zbba2pSjWj&ZoXmr!uutzixR?wJBm5fz837Ef`IQQhBQSEp*hE$>{|9XWg>6R5@}1z
zb@zJ(`g~a?b>ZJQTi^r4%-Jz^^Rm#D<W)ag28%hJ4K*=6Al3<1%e;Kl)w28JsdEpB
z(=sn2Y@z7;Q^f-L?Ac|*6D_~|4o?M6HKKe)*DAL)_hzlS;)Gt{?|R$$jrqI%RzY7z
zKKt-dfoeD;i^T-`Z6Tm8`?OR?uR`b2gRoo>%CM8kr;g<l9Ue5ZZjOg-H5U_ZMl^1#
zI`$aAj#Y_{TnN6l?rPBl8aZ6+Jg!5du!L)SXN#sclhr;fAEbaf3sn&Trf9>V&z&B-
zHJFe&P;by6W%2?{1s!H5potaYeI}(5Ike(ufXlc~7Rr>Fh_^dCb#912D8CEap$Sxx
zSL%{w^}Yz~f_V?BGD=VoT2D}TR^`UP!ylpzQc`?ySB2rvG<k15rPh!JvU1}>6LdRp
ztJbZQpro2&XfEtO$!YxfB=hDt$XXkUQSr!Wjf7Y+Pl=V_#8`oGNDVBpqCqGjzK2&n
z6ifam(3IV-*JQHpU=@;Bu{9|6E^6Z;KYL8?SzfHazd)rdv&L&{cAS{N)wNaPRws^1
zu8LSOj<50BEU~=yWSs(eF)jknnraiiWa<`!h@$PyDOP+-00;fnjM@&2^=5RGMuvkm
z?{46H%#6%WDHBwoS=IwHvl$Ydd2v1nb^zXx2_*baUiutGI?CGtX{>B|idxx8pTX*K
zt1{_^Lh>wM=Lw~E+=6z)t<Ea1i0|l7f7hzp*JOzsT6^C_%e3K`&e}@-fur<inDG0i
zEf`D%`Sq*3E@9^s4lAR_E0Y!99Air93}n0g_9xVu>e+$0(#L_^z$0>BT{`OmR4CiD
zEYw#k7u@d_smx^niS;;n0xA6+6lIp=T)p!^Q-^+wASaF=z+DNZmRX7n-M_F&Xt8Dq
z=&%DkaZ{PJ{0CS(v|4tVuBN0<s@R@NIc?*(TbrpBo2SgOHqpB}moZRgZ_Vh=PJebH
zMwBtt)nr{sezfhS1&<q;vNw|#N>e-y^DFkx<Mma4yV%_*!!p_m{xOjO;NF3f#S#-K
zG>Sx#v8{qrhl+b@Oy+CCa@Zc{H}AQAs=<rQyLsrm@4*?qm-ZP|ff>6e&#)yucP5?>
zOV54o#eS&^1g}j|T%;`Nt(<aXT)8mxsJESfr#BLK&=o`wqf`t%mnofO=nQN$$<OH3
zbiwpMtH;woV)YQVK^7LlsA)$o>+_TAFZb7@6#xsF=8$M|X}>jQ0p$IiPtrtF>ylcx
z$1my-W8?ln_AlfNHF{(6ejfOgd*t2ugREVN4yF_R^;6;t?RGGzDP%En;3IeXo1Co(
zS_-)fLCR?_FR`KwRi;UbuY-?I9R*`iXfGZtY{5(E>`+|{vx0i}%2dy04^!Iv^Dlmi
zEaKhYMhoBmnKNsYOpbTqWgD5<XhvnKBCVLI;pMRRkE|;wz*Z8>2O+CGJ?aw?B=*{p
z3a&qHX8;_!o-Sr|xjSdMuM&ZA(-a4Cd&ZPp;5ufJHOW)B>I27i;BXdPt>okO1Bgv^
z%I!-zGp}b+FO>tHSJtPhw3(jC1=FL3N3%&F;Y#K0VN7C5d;S_E4)X=%Nu^C{j|FtL
zLJ3mGNDLpbDW+agUf$>970o+!`F%L3%k1y<rCdfYvcxegIlZ|n+w%;Vg|RYat);9W
za3D}rKC^7gRVE&9S&^HY)Ws=a78%8l(>XR6t9q?pwLN8Bft2St7|}9A#9FD-ezet6
zR(`Fcp_E{o&=edSF=4~6#RhF`t4A068<5KQ!b5wc+T8J<I&PnEz|$;v9F%x)s7r2J
zlze`>&5(Ze#n6bfprL1aqSTp08hAm$r4k6}T-jo-cPuWKnN_meW8?*F6UD*WPP(9!
zSjKiVq??oM_9(kJ5upWPLm#H(actP6zRqr%cabg+-H0%aFH`FA#?e47HnIQQ%QkJj
zowM0vTViEDxuWuJ=A>-rYO}`cR86S8RV9UO4`!TSzgT7FXWX$!XQL21euZeCh`>2-
zz`(=uTZj~T@GXFpcYUv~$OI!qq}FpZTdC4SH>Esq!roEJp7%;HDIx|30;s02wTn$g
zg~Gxvn$`Y6oF03-67Eqx6oS3F-I<EExbmHY>A1n2ObV|`Ek5`B8ag?vZpE4j{xT&<
zF<0AkF`f7ziV19isSlLPoZh!eQ9vT*m=8#^ld5_IC+4W_H?|wD-iP&|H*Zdw=63z;
zYZNGv5xVx}%m*`1Dr?#>LoS}Ko)*-$dcj}5*ldx6lmnD*_mxDoEtBF~%Ao>Jlm!*j
zGaQIYXgno(AxW`fmF6~K3cJXU(6(oBEyiDWaygj*of&b;c2yHyK65(K(kVk{fme}P
zeL_dUeV2kXwwpos(gpF#KBUgL_C-Y7nowMpEAMSZ!qie|f$Zx8I(tP-su9U-AVD0j
zHRz0^)va;?fZF0O+mSq0#3P4&g9-Dv&aZetqwq|SCxnBevN)p~uA4wzW}N<-7=Eb#
zw#>b7@LuO?EpZYkLr(UmZOUqreX3{(`iwLtq4LMTlE{Yst=^%MMjI9;yz)f5#W?L|
zTwG*DP4I-)RdPIJY-k();%BwK;)8P~rQ2HQj&dJq1$<C|@GnX0?bN_{@5O4F1L*=!
z)px<17VbUf%d^w014HyX#G*B~t3=^@u*Z*{6{il3x>M?+>^LWv6q8Et)TxhYIY?k6
z%nbQ{o`D}tszH3@8V~X96X8ty&shKpvnN+)v{p3?{4TFpnLJ0H066p%7f$Kll@rbu
zN%q2Wd3Vye88?@q#6RPAcUH)6L>Ql_vK(zqGwzJL+)72l?Ex=e<!a=Of#eMJ>!q9f
zt^BA;4kWxZ;>P(-3M}9KJ<F9%=N?NTw5~Bs;kg5r1^G<+s3;Tv5t?Eq3kkggGZdum
zD=+Olih{EgxjvKoD^8WAT(_$qsbaVT%f|L%Qi4d<Kvge8{}dvO$-RxMop0agW$+{U
zDi))6|LBGxA)n4-5{);UaX-I^uHy3&kCgF_xJ##!%*gx6?`NRA;_&RB*TG-n=kJo`
z&Ae~)mG*BRC5`i)KK))=^4GYTd+|juTfqV9imzw-SFX;TDjAa2m-52#k#IV$i_Nyk
ziqaj%ly^UOY??6Zav`IO*A3}^)w>jxPT0hF#=r$_t3o9BAs;e7iiqmYG7jT;Q@c?V
z(d36-IS@S0dL3SPk(pmNov!%D+G^tBz*N-WnK|V4p8%KOc2jG8PC^gAp#_mB2%l~6
zofa?8i^B=e_q^0?o~F1S8N^D@b6*v`6EH_p_+Mr0qZrcg=3f3>D$ZU?#eA9dw>`!e
zLrnLLV2(JqC*l64Omb{yGjED!Y{~r<7PHI03^4euF)6!b=N?Edvff!-*1Od;iCr&~
zIsYD!i7}?OL_1L_M9TEuVF^hSN=N}xWK%x8WAnE{388{=S+Io-5a+`kJK1#A-cl=v
zc&ZX{DG(M$5d!6ZNn1G#7g2~*zjFXW;c2Qde*S>y)y??7=!CTb4nzO3jNOvGcZb78
z-Bzez<ihH2Ge6cJ{#cPv&uUEjb09B7Uc*)?edIVVqq;9k)K@oP7Ej4{Y)MrA<asn)
z?6U$F!hQijEr^{;)BHn?e9!r3eMSS*<t|E&T*5@yNc#fc(a;S$i)PT){OW#0WL(r)
z(xb8fY`E#6BOunmO>D^9=6H#KOWAU9U)0ObR<U+9u`mLLZ2tTi#%><>{}K0=v2is`
zm}tz29Yf6Q#LVn5d+a!7X0~HyW*jp!GsMiy%uF#eGc$}~{eGa`mG;M8UFlxw{5a|<
zb*X!*s;i#rs*~l}Hdjo&S)4Y=c-f$`2vfzDmR6C{wHv7I?3*GSDdkFdL%!No1m56!
zd+5u$cJWvy(Byl84cxebzbJb_=oMKTR?yfoz%I?2=uGhGeeW@c(`X?hj&Q(t?JKsG
z8h>+?d>hAxC0}@NeL-4ClM?*d%IL7lQPJ9D&%)Y36<~i}WxIO4V$VTf(>b*cwqoIN
zG{2~wU++L0aF<NS_i`~M*MBp%mtfD`v-ApN>b=;!Qpl!pGL$jiq<klOhU}LSpEgF>
zC`T}M3U%U@9WZx}(%oFMwjH`)9q$(5mKMrkN?$66yXxeM_`K;d4?aoC`T@Sl{hG>=
z5+0H?U1^Us5pmIzGTyu-4Ul?$R=WEE!eI+ctGR>mhNhrR4oMI4Qs4W5T_{a5{L&dS
z@eaC&&V6%>4_7H<rkv&Vu@4HqZ1tvmj%W$=qG(8<=sUuS+-*r+)<q(+sxY=X^PFE^
zao(OKK%)ri%{Qc5^H<ODq2QFG_M*;d+RN+^!+2MPbl}{76f0gycwxH{kAf3_JfhgF
z3oI~2;#;<*4h^PooY0gvC*-)ld&@bb2yTkbS<oK9`1UDo2>-7lA}<@;)5$_NDv~1W
z9^{_lKbgSdahVrVHmR7dx*pQQO=H^0A@Y-2PZaKGDrD)sRI5!MPa`g!NOvEY-rg-R
za>~P@eOo)25hF$moAn&PToF+jhi>^HP*Ec9KBXO?l+*EVnm3o4R_Dqi1qUpy9%m}=
zS?%so=EoygZ|}vV0=vZQFc`Bo=)cG@F$i?gFGAIA6=&bFmCCMV%hfeB1{(;=dDYA3
zw2S0%PDRhUDyff2)~mEj4k5y_8#t9AqN-t-hJ-3$MSo`kQR5|0r?_Tvv@RCRwMm9M
z%G&3h%iBBMD>LLZjRh~Za@uMgpxX_f_t@hjzGnjv47WT(NAn&7+6V9yR&Q6zXDVBF
zOfVbG^Kb7n#%lI{JmSFaY~MlyJe=SS^D))1U}`Zte>PRJjZ9)zi=%Z<W6Ynb)@1sp
zsV_dGD*Vw(zFQaW0Vi)*h3&XFGQPJ94EHrT`a4WBD5MdN-%fH>Q_<rK<^HM`%b44b
zsk_tP{(B+kgQ(pr=&Jwp*^Rw}zpX0e>I&fRnh<S!!zV<BY;&JAxDpmLy#^VI+|hqK
z^XAb`mwf&y;lP#GfC!2&idb{KDh6yoYGL%6BWY!hkAre+@>-CZfK2y<b^opY1U<#M
zB>E0pa`9T_vt|0-tl>9|j?9+;?nTzW>hEfMuF3&#I<Ouc2~&g-iFAswP24ekb$Gif
zBR0yC+h=n*kB)%Zlw)+cX|eRjYHwT`cHs}0g}d9%AVG~(creCh3sZ<=n~gl)Q9@*j
zV5VBX<V+(XB*Bjmv+=%A!NQd{1Pz$TAJsjka|NICS{~tmO7~>Ey;O6;I>JoVpzYIE
z&7AH{FN|q9u*-KGZ;?`)vYeoEbuar`v&0mpT9hd=vaFkDrPa3_2cuY(LWPy*5@5NT
zdqf|xyi1456=a|NgMLK%>N(xyy3l&>M3fUlZSaYfE5y21yldLe8)vKS+jan5bh|x{
z{e!6MQ&_qG1GHDT{y?c12i;5hBIiKnwpg(ud6n6Xt^CF8_9=Imqq(50s=!&sDCqzZ
zG1V(<<7n5gIq9e}y^-D|^+bhxiqaAM{F&PDFFbEZR7^VJksjG|R5IJF{%Qu^DEJfv
zi0*G~oI2CHrpssk7(AT{F#J8GETYkfdTZTtlCzXz!I30PbWy|zBCz4<!!<*;E)7lk
znnL%}o-b!Z&ITp$MKR4=>@>cP4AfOHMKomklk|F#n&hQ<m@9lFA)3v}o0;m@lV>E<
zZ|KE972Tu7sNygk{UsZBMzwS5SmpLUa6s?pGyAC7TaDcX&TTCGZf=D3#|w6y-B(HP
zn>M~T9?BlhXVc4EiV@*XqEg`3l*H4%sHOS~Vl`azHs7R6x!v!*Oz1XFDzw~@)Wena
z1&!}nuJ*2sp_q=k&&gx`VSIe@vb5;|mS(6FHR4^4j1mhKq>GHO7{=t+y0<^%FZi7|
z1RN^&rU_~Xt}y=SSza1Rn|xAY-5LBf!|gz$1D9@&>8Ea$6+&V5V9GpOzj%*7R`&EB
zWCL>N2h!OMSixy)pAH~{rQ>W1F%P<<wo&hd{s2J3y1lA9>_(WQ#CQ5Z9mra&+K~ZP
zMh`1_)@k9My$>Y{r@_+OeNnE8L#vkvk-yhGU>Xe7NY5p|Z}lKvv(39azo*R6_+8cg
zwF4XAYd95N3hL5LcnYI-5c~da-qO4tcG{jWjEUmS?AUQ}HKl~8P%@yfGJai1F-Q`4
z;OqSyk}gwZoqIj6wf}l_B6i-HoQr3%sCU|QATDL`K5fLz=Nh7PP}e0i4?DiuH&W@6
zYk|n(#P|JnX7Z!AkPt@75sjru?)>qhjV3TW<a^UK`jzRk8<KiUY~vvI0Mo}38@z^f
z^SxdW4=s$is*0UrFrr()5>EKzyH5Gx=N<YfogCpIy!lg82W)XJ8n#h=yOK3*^ORp-
zYZqr9$ynBm#|Q4mRgK!*(mlL0MzI@aQa7cdCVuo-Pcuy7dEw1WF8`7^j$y^`UEewE
zE+2Dl8#2C1GId4cmpDE@%e&VcTY&4_-fgfUcXMQWj;A=;<h!PLp@6%$Al0l8YBop2
zt8lxV(D@+AbWTxE><a7?iITdpPGtUW$5x#sa&_naR`dr*E|9-bXm@}+U(kGceV`K)
z+Z2<8ui#>?<oos-(?Yuq<PNZS6G`WJ#L%h4AN;*wgq4S>-PUHw_M<``Qu!q!*ByX*
zJ~!eNK_Jq_QE=bcAHo8Ie6kaCgR_W4v_zR>TD&cZuc7F-R3`#DjmcdNC&iV$xp|i-
ztn#;9X}G@A!J6T==e{7KV{6~qKfyA(@tY$Bc-99gWKH(%_YAgy>ayqVsQ>D#)i9wQ
zJmGPszBoHEApV-pH6(zQITYV#Pa=|5DiW0;sXY)CzB6g)ZlONJ>JhPZ7v+2;>`oQ5
zSSlo%qT-oMW$)Z|djH%Zz1&+Y)0@0lNK@0ogSxIZS-}kYhhMyZ!q|H*zA*yJKlX41
z6%E;RxJBJt?+R0zFhX}{3eu@e=6mlDK~?CjYGU8+w<<R{ia5Q+-9o+5`GzqWYcO9b
zg6b9LiLm+(dR=U82*^Et$~#ADH`+0>7RVldMU3NU6^=_5#69SGGOijsy6aCD_1(^f
zu6Wwq<&{S1gd0#mAC>2+=8%OlBJmcvTX;dTpZ9}r)GAu;@aNPjS(ye`<avpL#p5YQ
zzbYD|6G{UvS#e4r!k0T|4|g>5+&9q&E)3h!`Pv;3IHU4MgtYyU=5}KiqD%|ihqXZl
zU9Bp--;+gqC&90b>7P=()`}c1(}=q9Cf(&7m{V2eGyZ0srcM26Dd_p@j4GkhF(45t
zF7GUamOBed`qmyu#?2r<4Z^jiMK(4gV&B!ueq03*&r`&jZVir(k|YCHBhGota_IJy
zwS<>kX{{~XqMU`rheaQgk!5-g`%AxEAP=Fpy=)2VCBXs2H_OIBI(9LSH;FG)wBgx=
zN|b0ji<M>^fzE0PF36Qt3yXDj%xej@WuaD_k!d^yfPM)oiVkT30F`e9jqtX^0ddw@
zN72UMq*$3&9Dm6`*i`<_=NJ{+*1lh~YLvmN=+fR8n;2svSysF4rgT}77+K(Cm3a!=
z6-QL-vd_kU5@HF==TRK%r-h35nXiHfQBay=AAa&sx?v}&PYhU42+So^)t@ZZaYUa3
z<`)5cx+#<*Y4<B#2B%QYk+sKnCa8)?lA2A)>_$|FXY&{<|JwJey<J|OLjRllzqVfw
zj{@TowvySi{;aacOI^_E`qD-`EcC}|!SxSUTX3p{9`wvvXb^~dmm)S~K+M_B4{GMk
zgTAZ)+J%cMp#T|D5w1p&E3@}_SE&vDHUa3gZJ$BbYz)Y3wL~WQm3280>ZSUAD<Tgu
z(>s%88hFy)388t(q|C>+;Ek(^K~l<m=&^|3N2G9}Ai&k_ib1Pl%+*qBTV{5jl>x^E
zZN@KWB~<f`)q$mfi#^JyIGaKuhW0xw?ej{{MPNjvj<V%pZGm2^4C6^+RM(e7-aR%w
zAX_+&?{DF>f~;47=oNy-Pa6=`hctacc1oYj4HkHA?TkpQb*#(znIcv&;wNXv04d4S
z(s29AaubAmjygvz##3UrT_B4d`H3D{MggS}%>0z|jFw{gt{Z;*1BQRz7#|IMy7NmN
zVvARil(1c<O9RolPYW9ZXQa{oO<d22(i*vbpt&}2HvDUi=it_ovFu8L(wtF4jTXtE
z4{vUk0ty-^bH)DWp_a2>Z3-aVO~y)(BGA&IVW5FDY_=5UmhjkDuPS5@9|IZ@a5^e1
zaA^aFSzsjlZMESDJ5lRxNMZ)#*}wRJLmRg?E1+1M+IPI2TcN)ZJkk9)M~ipv`_<-}
z!00w3Ze3vKlFKI4T0$7*R8?r-l}tN#pDpZLmm~HMg-=pcjn<go?d!<j_uu7x(nI6k
zlV&Go7I{^C;$WuqQ6PA|i5_B-_qtyyeXmZxey8XnDr&bNcY;kZ0S|_Jp$awCl|0Rj
z;_a*g?j*(YA4^@D$CfEwo%z^vj0Y3<Nq|rvz7hr<EF26K;TyX^UO|H|nH#lF+@gqr
zbUbK>53WCZPWz0_(wezT>PZp}t*87790-_7pt+-H%l1+<qSGJD*)i~^A2_dm*#9|n
z>`1Hd$+_<3A%No=ME&VB<_>u;7dfXvwjWuIZK&0o`2<$YG43-qPwaH$!4rTjZaJp6
zNOY8Rv?t84SL-`V>SG$SaJg5+CDXh&+|$7%nxnSZ+2c24>*S%9AvuRQ$V7)E8d*y3
zTa#^TGr3<CyJsj^ze%sREr?@dMO!{9W5BHRt-NR|75aKw7*A?yl;9@)v;xwTLC9$1
zI2Tsr6l>@e6`8+-UMTz!U%(Rz7Hr{CMD#|AI{;~*Rl#atp*O%UnTQh!h|tBB+&es-
zS3q5jUzyz+flhmRb$e%KE7+%eZn*2-E0jFPxc+3rlv|eFa|{;?ppjdHXwf9SGJ%j5
zIB}u)<@5FM%^rS|sAUM*(eKufA0$a3iLc-k{^WASLR;wK#X0J$nLXhn*DCYPnIg&~
z`#w#<z~uCc>jgs_k1D+5yMWxX;db~yEZZ8mb;|-`u_;FuI};WIpk3gN2Bw!^b%iD>
zN1W5~Mh-lkQbk49H0Zbp4zlHReA4^*3}pvuL#_R4d*mYtJF6_ONUndJi~^~?>cgTO
z35lw{RB4$@mmKDKKr$#sXa8fj_d^zGEzyC~r=x$jpSHOzQa3nKSWa7wKd&Uo{Aq-N
z9<s7|93M@W3t*8T4`>mm4l}RDV~I?9MJ!eGCX-c@=P%zO{Ye~zQ+_vp+`0H+V5$?4
z|74N;-r@(o3%IEN-X;G@lCu9iQU$1sdE!DcQ8lrlJlQ{4DVS&Xsin9Xs)V*8Jg5?v
zxWLlU*D89`Tzf5KWsZ$Vnlout%slAbg*5QPs<l1hu<-em#$!~*E%P!gD;$8$6+6zD
zSdK*A1!*SH_NY<N0U8O!!|9}+OgpQ6z9|V=Y`Fe8JqRIz5bYS3EBi7~Py+vs-kF<M
zKqZm#RkAI+MD>v#5A6Z1ao_L{{_B^BbdlhgVlHO96A>toAw+*FDl>qdq<(8xy)b`Z
zdVFZ+`zLb?%j~9Y2*vfnrg;e9f-nVi3c?zG=@5M}<mzWnYqpw4&$1`rmmv>|T8pb(
zDg*NR7Oj)SL5Rg?nQa-SYEt8XOS>~6`SQRkxwx>xM1wbp#f}%0R;SMX+sRcN0?h83
z@9ZoWOh-$L8iIRJ)Kghigh>%`yp%+B*QiBa+Kvz3&dU>n=zS!WNTXc}yf>++JCxiQ
z&8lPki)q9BC$~kFxf#0_pzvsOq$TL0K^J_t7EgI44VQ#M8Ec1YZEIr#k;M(5p5#we
zH3MqOVum<Vq{YoLVmXN`aDdAfcEl{x;)LNZE~~EN5f4`q&9KalHu&a~+PhA{a2wj?
zJuDn3G6)<Rs*se2Yqh$%>~0Uyz9rr-;)KBB>Q8Edc&cL6&}PS9;XnsG8h{JpqXIH*
zc+SLn{_>9TYI(-)jm*5dQG?n0T*^DxTV&%p8bt$4Spf`V_Re3M%DoNdIm&WkFv+h1
zL7_d6)KvxUlY!Xrf7?}5idwtZerAhxmi<hr%0SdHmS4>$^3%FwSe%<R%gOV8;j%g3
znRWm<=I<tb;^k47G@iLwacI`4s_q_!G^4qAuSuGw%X<#v@0u46{907Jv&Do?QE9w7
z9C&RG!A^bRa!VhdqCa9yrX8M;pZk@uk6&)sv&tsdS*AE9Ad6p6O>6DQ((tjY7O$OK
zP*}G+*Uf?_aR_%XKi)gP)U}s5QY))XNUshLtyg<1f>T>+;Fe;^I__?~&&x{}LiM({
zR%V8)n36Udo674oRVy5D+Iop%?}yC$($p}KIwEcZuihg<1|l=tFxo$_7GdIxcTSAV
zb)H0-FE1xEJNTikQx$dIv7_2R{AW+$RyldKL+RusRj;G6t~z2Yf&n&SQinOEqr?0p
zyCKgXK9AC^-fPRcMhkZ!IP)BN)Np0S&k7TZszU|7uC0`e#bFj#0eC;Ho(02FP4_wK
zVmJ<$MPcLZ45XUPhDrZ8t`@$L{@IGcvx~;OBhS4@FV|Uxu*SA)c;wq;f}HTdO2Z>l
zbPJE7b9QT%pO@B>r=l6%5F$n3GUR!p+RmwoXaRovXSJ5p=po1nuWNd1OUJFQz;EM?
zX4x=#mMaxZ%rHMlbjOcq-wbm4jAq5%(l{M>soH~rs;pc-V*tso=SvVzqd^nQwa!QN
z9}D$1Ose33!oXxMDz{_}UZ|^)p?d=Is&Cw)Q{mwg`~b>;2G@AZg@Fe$W9Ju5XV0CK
zd%ddj+n%U&#Y{xeCiVUdjYkL18`7dATcK|n2Ng)s`JJ^Zt<&nvB5>(6H$x1$M`5Mq
zeIH8E=vkI*l9%W05dq~@G2Us@H7zH8QGJ1e_ly8yt9at0HM_%FINX`q6`P_o7nZ>t
zNOUwM&1#FPk(a(yef`qxjwP~*)ChuohT63SM`CidPOTOE5SW%PpSCVmzS(4YSj&#j
zlWB-4A{;32U%;MI8x3J&h>lsmY)9{`eshrNkQ6q8eFFA)i-OX<m+9#j?VyBq$=5G0
zZcXBpBZX0MactgoiK@=Fed)xnLG+(Mo@u-94#>2chx0u2pw+=;trU*&jgfO%c7u;>
zB;Oii`N%Y(F+x*$ix87`)V;Z`jI+q*1YHe{>Gh~|qS2`3?jJ$Nu(Fmw=FuwFSx?Iy
z|J-TY)c5tzB3S8k*)ET$4Ggy)4nD@Ze)e%k$^}uy^)99n_Qr5UG<yxWCcW-P&)1(!
zA~sgdC`+FS8`-AT;3-FUoN$KPG^+yU(SG4E{8Y9gC6yPFzcpIlXwI<m%T49&H&>1X
zoIZTJsorQ0Bbi+z(a49{5DAIM&*g0}ky{&sdKbo;LIgKp4;DyuN36Hj^)(=P^Lpjb
zEb$hi?{!<2g+>~N1^HV3pm!i594XkU9g^7?1fQU(q0L0)UPLfvb=4PvW>!G*`(XHg
zLhJ#o8-omj>R%FX8IbDWE5x{udIOzC&_q6@XO9xtp5FN++j(JrA`MyM=gw7WJ>SZ4
z6uc@nYV*cl22yw<HqcdPh_ScwTO3<w?nhN?N&GCYSl;@yjyLC>O?N{@eV92vAUXOw
zRZ#AZDBbL&Sj8h}*r~7rBDGiBP)M4RtcLi&d1p<h9ixKC^2tKRHve&ji#&qQvb?-L
zs^}SWWOO%wer%ur&1wa-L8QaQ-!I__!##ut6VX|5#vHcwHCik@6fk;eiTskO@k}Hd
zo!~eSc}!pFbMt*e1wF?wBNxhCJf)a4O?HE{#9IJ|Q+lnzw!Pn-LXW0aReRj<QkuHi
zs3NIn;w&*v8PzB;=j-&doH1c}6wE9@EEIQ<#ci3<EOuuxrXp)cGu<&0P5jjXD+NXt
zY(lrRSsh-&GnfdqbW6Brz1vfp7+OPJHI>yxW0gs9b(-s3=oXdYpY@C_=U!7(<hT5j
zjAEj==nNRLnaLguK8^37V}r>f*CF5NkvvT{12+u#cc0BF7$iX$QlkQwM3mpfB%}2?
zOlNVCHd3cqgylWHr$QU{x|6z6Dg~Mfe4vQ~q&D{xJLkDH?OFn^d~w_AbWf&V`qIsx
ziG6Exp)ja1o*c0w@Rp`*B~Sn8y`Q6pK0mUI*c7haH2;1ngBM+WVX_@Q5L<m_x?<0#
zERgD%giK$(y<jF-#}MhsRMp;aI}dpWW7OBHPsQ7j+%Qw1HdNEpF15mi@(VjsZJt(;
zGM_v%={CLn<lpBGN%8#7#uwS=fYGQ#%`ELTc-E5W49;+O=>NFlhD>m9+Dv;5CqrBo
zg>Lv_)Igef(3cstWK;J^W8=4PnogY=p{zn-{x$_!F_!c<4A0ONL`TydnIRV}GnO3}
zOMPOo;lZV9rqsSHtz-^MY-vNRrAukrFT;hR;ZMnQ6cQ`SR^x_Fp0T2Tr+>{?0Q)Rg
zlb_NdapYTCxqSCw#oQ<C0W6hjofc>ABo?sWaCd)R1wj9aq#1sDLK>-?$y+yl8%_L=
z_8t`J-ZoHvH$5@hj*X0~a>N6J0JXkCDlN_sHS)ng*O=L~@eO+$-R*E)j)Zi*(P!vb
zX+jFKS3c(B)z2wRV=w(&{8|uVCkr)B3smyf>+BvTE-eNP_<*Q-9sBAFPSe;9Y4!>V
z>LVU8cRe*c`;^q~xD=|SSb|O;rB-R%Z4=@PYsp{5O!&3W=yI(Sxr%FB4cFWq&2B^z
z2y{zdPSk2RW6+M86~E6!$tfo)YDVnbvFnTR<dSZn&?W|QgDorH<>OXBJj|zM-pf2*
zkJSkse+K|&jvh=ROySd7-`zI`+d2=>Ls|i~d*N5kN`?i?{XaK4%~;S}Xr4JJBFU@B
z_jxe1gi!zmmS1$ju1d!}=l$uQagZA)DPkPXnU~1;w}cwnZkWXOpEE97@BGCgiVGRf
z9*@QZ<t+?dN!!qdACDNWUnpFm%aX0%s3LTE`+FW#*&hkH6WjV8sXl%h5Hu(v%IKpd
z5Wu;C-6<5Q0kqS^;A!jr(eG`!sIoF4lndKO!M^6=h7YQ?5~R$;`~(;hpR6|O4wFnF
z94;Aq-~4Fw9+7^jnT!iS8r@Y2PwNP#OR4)X1#Xf0B`1i?Wx~uk#x$dliGb|YJ4}Uw
z&Y^fAa8XJ_@{@2Tm#ULf_WcV55AYRcUuZ+G>#sB6crPJOoaj6Cc2ZkcJlR!zIHF;R
z`3wSZluz41CSw07I`QOzu&4`gR{6`YVYTw?lFUNqwENwoA-;=>;oFrOuCsZ)H$rH)
z6{UWI{U6=ukN~W1{U0QyaJhkat%8Q*ht0Y!+G}I+AH>xsAh~dYm&4s-u1H6`xA2=M
zndEOT#4|cF-)r*{aS_;i*vrV9&N(W_QhFaR5lL4ma~kF~|7{<je{Ti<x$u0u-<JqS
z))FL+TWWHP_2R3Q0=_(N>{w?Vf(!@dfxc0gqr;2OTswthx-Wmu@RiqRbZjuGIb-5Q
zXMM^BSK#O+(}5A^DnKJ~8&dO0XEJ8V>0P}(`(;{TmGhmaO{|bb%Uv>1iti?qm14Z&
z7ModLF2!$N^6IPJ&%Rv7Xz#-SJ!V5_TCb=fkvi~y?*c?=&rZ@b65tFlgFbXNs`kVM
z!fTX7-IaX^-JM)GkY(fw^Gh0#oNq8=ifVkFrS_88BoE<<{iaARd{YKdX2-xzd%Z5z
zu%kYG?TM)r@Rs0D1W!qA?mA6;lhyw>#!68$*l^q!JVY6f)?vyv8raj=+ED$25AWbA
zY^lRZbM&Fj!}!{$3JU{kTNTYSbGaI1G~0`nUH(P&nLcfwvS@sDFmdwUwgJVUUOj!F
zbKpK9?vh$ecaWm`RGdM`NN%RVPNR9c9T3-a=$SWvN8)hD-L|T;L4+|mNz#YF>}0Lq
zTM-t8O6xb@VLgo7nSXITjn64GTwSJi!^AXb9iuaEx1Kz#Z$_rAzQ-0OMKiFKKWrd!
z%NJ`d!aTFFDzR%d7XEc$Qk;lg?!z5Wn`f%5ek~Wf?XXX$)7asVTZ&5BaE~(VuuR!D
z5_{;NN6&hSRr<ga_eyxXC6Z8<^~S8!A{#Cm6BaLU#s-i{K0CD!ZXH^BIBmkRVan3n
zZ0{kf+|+<5;zOz}7F}_bGQOH24z}5$u2~a)C>645D|ZB5z_V!tfS1?i`H59yzT9xm
z3gene!i8jraCG8bz<LSM$x>@F_HK~sn5d^^NVi-QpB-qGFnk6ir1GN(?^P%u-ZCe{
zFanEfEEgf}gmEhSskQ*pLn<D`)^SEFN&(O^a@du4|MDJk2;;Lt^m~m5o7Y&99f|05
z74bl*t^JSrR_<ABBetYB2gp5Yo}mxy)+E|$Fov!#m->K4iRA~-shY`n+#b&4?A-g#
zR&RQs5RuolWbcMlVI13A?XYto2;iZtsQAlS<)DG6*E+d6Yw7htxj%$WLJYquW>M!-
zwIc)}4X3k~W~KA9HE;2A0tS)h=nc-!?M$23S<eezF-J@0!e~zj;_E_ZT}r1h42#pS
z3yBrU54wVhFYoGsS<bJvq(lUotB0Lm$Upmj{~Sf3qvc;)%kn$fYo7GXgq7@!Xv)D!
zedeO?mZiJsBg?s6i|!u*68|FQchAVK?3q~v^^nX_iZ}!G5}~dj|LCMmKbCdPAZ>DP
zV0&+ZZ@#1CKm-g6A<lVX{~K7`{K3KZlHs!T^^T?6zlHWs&w@=;1rdu%g$1kU-UI*W
zQbbIdSmTfH7GxK7rq6>HL6%Yc>+8D(pyEdzO7Nr8{AshyLrzYa`C}9#{6CX@XC*Gk
z`tq;0kC{tRW+tHpQ2za(h(-KE@XvFWE^7AQag&8#@+AKm|Jh0&i~C>tWww~v|BU-i
zRfOaT|7RY5Cn-|4|8V$TXHH!F?LSlHXrcWtEK>ikSPWNcc^CfB(YiIq`Y&HEg^yGF
zeObt)cH@`)OeVGFfB6<lNCMIRU-0_>U^)MfS%9U;*!I{b&>n^h+g{I=*Vj>cCwB2i
zPC#e!uPlmoXTGR+3S5A@VeI*H72bXIe1jKHZfonsiFp$0I+;RBnF?_sb+`3gJ65oU
zJ4r|*J0S_!bXM)gvpBN)K9{8^g7#VQ0SsNIg6cwd507h)a8!ZTXv9v1@m?bTZz<u?
z>P*QK2G6kn^2~^s*@bC)KZjq};e+@<spW#+5$T(<8lGD<xg0avbXas}ikcjHeQ>PX
zdO{C&MM3?4KFP-R9*wPvU|o#AC5}Lad`9GdV<ql*>bP}hY&ld|tNgdb{f^&VpzoT=
z*;#&~VYDDr;I+IZQX5U-u%}e{^Y_uq7aw@XG*1}dZI!>;qT<uCn%IS9#L^H1T_l`f
z7Y>CJU~=Lh>JEznF#lc=LH}!xLO+=1G)%<vTfdOA+zO)JGe4b5<!%mrQJ3p3J|-*&
za9LTMe(&!00be{0%WI4VM2UVGaPAwZm8Sxei^ZwifX8mKik&lvI0LC{7dk+CpwT{8
zkVYOg>jS6J_RaHyf|!A#<aP)*SDGvOqsMW;>4P!p$fEuBeA3sR1<%)TNbL}xpg6T;
zje7&BrJ?C-9v0s>=pkq&Muux{T`<WKFR1csxm!PdbdH~Z<K6ayiY)KtoyuTAF&6I*
z##s=PvHr#p>72WF4wXO@{}K`2sHI2y9eq=fY%NfHs6{)3hcdo6k`sS5>!&izVb$K!
zCXVFUOtqY8iJWIzpO5*RtRt3E3=)kwCyO>hf9JW|sH0^?)y;*9tcqyx^DZ0@oSQ=C
z^5MnwO|oT@_-c0~t()i93Tl?>Y=yXn2R!-FIunH1UDK`!-e^M!;Z68#Y4euKe~95`
zF|J?V)=tOh^U6B#1hxXt_Sl!LZyM4$)4B*{Q1xXqJp<cC;&<o0XO$-wqlX_`arK^V
zA+{MX|0FDJTIsCSX-D)AGaS19+dRA@d69@oQNcWr4t+p6nVhX#^kAP=;!7w>>i*-W
zC0e`0Y(5;pSr`ocgIl3oo`K9PaE;5;*Hf;;p>?9(?yuIEh-Zri_WeTg+QI1&X#t__
zM5NKHj)avRtgxl4)~cP{1anox9}OsjUnc@D$tt%^wWRLf7=qWVCqW&X5g6I+t{K<;
zF=dBjKCu!WnVRE^FflIbe)}WZ4de%#EQJ@nc_bHl@YGQ$WnL2)tO+KxmR0H~!N;8y
zXjQc6z%M58aN$WH6j=?aLgU`#wL%h;3bco<9}~A>26_txO8IlkD2zbV=6a;LZfred
zoxy{Ej$MNF$vG<)HKtLVV(Ia%!`!@Eovy`qhG=s<yU!k$fDVG$^kA;QWoyFHAnx9F
ziPP2UAT%xFl*6xzdzU~0Vzq`o9MkqxwCQ5e0=nM`tsO{zw&i8%vMvuzg{R4oG*5?S
z7mRmI)x^Y4lh9>8_ixPKw%la4O3XJ~(Q0f*5o9~knq9Mx8!x1U28<eWXmUrtYvy%{
z+h;UcF5sXtZBGYumV&}Ob0DeD6}TzG=UB*!kor-~7q#{2R%#4M8_5QC^OJ%lWVh-!
zF)&*sC_8JC%3C|S@5YClv?YiEYk^gUIgg9SLlSSk=FQKRY2h~l1<wip((<3+OFS{N
zPYxz0>S}!x7&6AMKe#!U{|uK4Bdk921#P!JgoHK!(W!vV)1vk%5R)n-v}#g=bm%U&
z7a<&4tAey~=#tL;Q>Hx#l@H%4fN)s{dl-BW46$h0I#KRs?eR&o?dl?*!H*owwM2$n
z*XkViRkjcUEY8Z4I=1@<+gqu}%ZWL@OBd>@&dgP{U=-P*s(-HE24C5(O%ut6O$AOX
z4d7X5Ww$&333a!_F3;VbYHG>uPMGR=Ro|mxmZ7j5Vv@S6WqO2+Q<EwNjde^-=ez>y
z>Vc#8$UI*X|BHi-ad6gFYb<_6k;Im$?~lo;>+A{y<YQM>9QoxKX~i3UI90{m>BFh{
z#+vgT=5Gk7rIDiMC9bl_%Jp+5W{#F=9VQIM!J0agR#r$R9Shc#AJx?jb%rNQ1yOiP
zA073YPDZUCEI#oKgPYypcq)slZZy23=)9q^urwG+*^G96{p%deApl2^vfVd121o#l
zy9=jhQr)Q)Tj-%V1;ZMfxIZ0!eTOO(U{M3icZ|DRFngBv#;N4y4ayP|!|LrmD|cpT
zlc-+p^D8DPh()eRn@vIXfZ4QrB%d4u#v7E!q3A?4+&cwJ1y^!@<eUUw)HO?9%vs>n
zT4y%?BsMnd`|2U7q+ty6)=qElwxP{j$5jS9ZV-r1$=rU<;AVZSOY1%BPT8?RXn3)q
zlpVLpWX{|noLN-o=pKP%IqCx-XTQl~B4=wk-RTSYkH}o(xq7Pk9t`~u{flhNxMFkA
z{Ih?nkA{)e+gQr*YoH@9y4}dkq)ol#dq_js?U%|W#FdtGX(tSRwxt75_Ua5CG6|dI
z-cf|R6?IT8;sx()y!^Vw;GXj}Wksz3p95rVkkc>6QWP%H!&!k|zY;)vX9uLCj~6&K
z?MRb4c4ww=Pw2w0hB(xb-;>I;nA73<zE5g9R0i1O@QiBK%jV`PW;hDnQ_TQHpp=)7
zl><}vP!a7knhAWw7DFWfGf(ja?ONY_H7qM=D^r8aZkA3}pGa@6Rm1)!zIYi}h*7YR
zpe`P5#)=F&t9aV5OiY-lx6)VTa0Y~Fr*{9EOcWc8vsy9H;Dex%vEFrNh>I-+-rE^+
zEgXhfBilUKh?KWUY9sQKEj6Cw&6}C0pU8qhA*B!M<c9w-uoYe;GG_e8e1ZKr^7n*b
z-^R{)p8!NcKR-1<cln&oevKO-Ik-muG~_Lw1oaCl9tp!~W6WG__pf-;YKmh8R__F`
zEU}T>hj+fi;eUxmz35IFSg-y?UIH%Dv*JqPhHFIW2!9a-Nyk-3(86@*%BiZn10+p&
zlxd7R<_%OaPND3q_rg04b6;|}<lDPLdISQYKr4wtwz24THX>h-W6elCA=R*VVZ9G#
z84Cz``X-R&1<1QgL_TfhK-rqL*}<uhPh^a(^SY~oq~QqgFH-<PAr*BVzo)wsLhSuq
zt1eK|0KSUE-FH#rUTs|?1W`yED6axY8KmBIB+~7$OrFR5Gbe6e_44Kj{{V-&?QijW
zUQ=Js{tm!8#O>4_d^)jS`3^&xh}47(Z*YGpY&+|rW4YWA(%^v_Dfk)G;rr;hqNV{Q
zZ_X)jL-9T7lOP3}lQFQG$Rl)i0>YI6?$sTb9CkzRa!a{;ONH?^Q}+0#39r^+=h4aq
zRD1fjFZ$M>SzP))2qb*deS8nf;+eU80OER@KdvwSFY$`4?rrI`7R7nHgi9M|5<f}E
zSSQW+gZmS!`*#{6952S4pJJ<n#SZSxd&u}amP_EqxWDcY2yiE86653g7Ryju)>4?j
zQQCj`<CpP!rC~NsPOxv?ER#Gpj9F7G9tH0dlRPsn;{_Y9MwzppI|-D}*IhIve^BqE
zX5{f_Ko@44hCfkIuzsrGTQ}(pp*XI=kVp802dR_%9rfQHzduNS`+vIcouP$ydBUq8
z%>Uo~?f&KZi;k7cL6-4O&w*W{f=>7SX&@nwi+8zHp^j=FfU>1;ViJ|p;f-6G5ZE1K
zzY*tj#Xa#BmDu5@)pWm5o*s>)A8lTXUvUi2y&?0bDvQX|;_a#5B1e0Ci%#b<cZIR-
zOo~oXN<BR0`hD5(z1>V9?G-gYeZ3)rqy^$-x_#Ycg}sn#U|Fl#1w3l{4Sv6ar6krI
ztER}=zPq5``|@#?LjUoy1T7cL%_E=|KC>>m4b*?fdH~+_AEg-W&3*jaYdi5^UI~U#
z9d$Q?<6DCcanbF~VOD97bK3R9xl60VOFkYff4SriMKQNldA4cmD51N?0xI8T`>s0d
zYq8wgzE|(_UC8b|{=_HaDI@aB0d~gu1sH@ljS^wUEo@L2uqS55G`+(i?OJ;6aHc#i
z#nH<rmbIZHtt+H-<7Uk{i6g}>AgivzW%w)_GJ9t^Gs=}gPO5Ti-GR-)k9<%*Yp{D<
z$5w$4oq>C%)l7nbca}8LOKs(arHFy_Qg;>S4IGFWCD2}FZm;ude4hjgFtU_O=v_bT
ztbJI0*j;(^fvAX0EPYX!Ot1`#Am8~Nc3mvmsvcmD+zOH=V)^M#=EEL&h|SrQmXr0R
zr?T|VHADv2(M^DHCev4w2!$}?Ow-$of)l{o#L(v{Bu-@AEuFwmTovbU-sn*<w0P@`
zZc1?o16g8f?l-G#Ru0F{3^YeGGnu3G$-0x#M=Y1sjSwY7`t5}dW+k1Y`3eh9y^JvX
z*F`#UiiD!SD4QHNv((qO!`wr-7GiL2v<+iQp)JLbaGbN|=N)&o#3@Gq5)p}*ET7AC
z{ULEm#sBMt+k3119Hp3~#8c(j1}&L^Ds$?*mQ3(O-4Th(>P-Lbp+BilLi>m2l|yVZ
zW?B0ejc?(`aM{~jwsK(3(<|y)cL+A>iuLnAohVIu!=uZqCz!%Aii%vFMSzSJ)rRqE
zzx{ow|DF5TvkBbmN}G|Tt;R6fu&grO!{G2N_4};-o<eX$EWWhy@;1w?6e+gvCBiEp
zHWlD<seo&{@y72Y$L2ICZFD7Red<6R>#WtYGu{lXUMwcY@k{)hD3_=W5mQ`McZ5}c
z=|}KK3GGly+E^65dTkYs7^~p1$JHzpz;nnH*l&-%2obM&u-U<)NjL4&2eFa;yMX3)
z<u4``=i?ntg|cXsy#v2G0wE%pvU#HWmC8WL8%ZM*F|&J?j~uDUcFD3-E`z5lvlQcd
zfb=z!zs!lmPFmvLlJTPZFEJ}bQOmt>#FX&P(qAA=IGQWlfa_zkQ1R}A<|WCy)6a^z
z;+z9M;Xmsfx$=0SKBuvvEc_ExfkGzU?u)E3<dx+kP8Jz^f&iWlIZsp(yo2;{DiwmI
zC&80hy~$^}ovM=(uYtS=N+!nw+k~Qh2vD}vqg$76EC%R@*?x3hGsY})XlRBX-E|1Y
z7X&@2RFH{w9PT|?>*uCOyf3j{Y~v0u^7<2Q2TRG5R{w(0oiE8Py0Ll6)lF+*F*wsX
zl6&%7=S<8UI1mxZiA5vtd#nI!ggq@-ZFdy3a~~0Ejo~z;N(XyQ8^)X_cTJLcuX5Gp
zQb8-DRu;b$Jr!tUyu9^+YJqbTSet{GMm@>byS|Gb>cCYMi-yjHvoV2q7xw|$>1&pj
z2xDOMa!!6JbE!z{c$=}Ks)XZy$QesMidroZWBeoi9_WXqh;Hs^l7B9l!3_mGz-(o&
z+(wlnD*5bT4CqMYs(J7nIlw@3c_)KWzH>c&-Gr8oLBQ5X{8zB2em<PPzlR7p^+#Gp
z&pa@6oBB)UBwN%tAij|=D&G2Ho8?vM<*PF&KhpEcc@|Xu5rcW`eOe9kB3*Dp*~(du
z#|xXG*0<f7DEHH`*)*bB_Qkf#0N?cZ{@BG`_H+{Ry-HT&^OZJQp6p%mDf^1CtZkRa
zg^}cS>>HI<a(1VaB{K3P?qMv|wIBx?Q|1js0c>%-JD$V$!-MK5^cPcB5ZT4pxW+w0
z>&~IuO(^{;g6sSSIg(0!xNP2bSodLCh&?%+675d3VUF!IK2E6dMM7a|V1r#-I?bKF
zF~gT-3$i7xHnmxzKNY*#cUNyhSY9&9A`##`XhQRJW)@{kst_4b0VM{3zui9GDYait
zeL`-!ay!bz*C(4>6HFckW%=}w1Ol#3tC94TqFJ6Pz*&5>=5ZWnq=T;=*VWHqS66>h
z>lPWB_NK9<oak@UdU~cSo_mhMqee}cR+<eqlB83`>-HX8H}UxwPkoxi$M7pFg2Q_U
zMLXntFl`6`aRIq!MD;=!Cw&CN0tyA&H&LNDAASFb-06P{Wp|wR7wSK@*}<0<Dzb7-
zrg1wf<vs&KpcYDUj_FH?36$sUjM^|mLA}_;Tm#Pv1XqotJ7zg7`dHxtGn9Z4>+~OQ
z@$i<OWX;X_hqRV1P7|fj5Q@bdkB!z%sVbrKwx72U1~%0a2PQgo1hjtqIF2K_#OICr
zE(UKqXy@`h;d}l0Xn3=JG`m}<(KXNShAgFAE>G{rumBT7K+_6dtFCfiNYpkJf>*+l
z=M=EteRB0KCgRAp?$_|T=h5!`3FV}cf~1J%J4f2wObgBh#Tj!q<{QDQbSy4G|K7bn
zofiznI0oN4V}e9(JI=~u$-5Eta6XjkRSQu1ZZ3WSyV+euF?8;xR_*Ys%6mt`Bf^^M
zw{Ig3cEO<cucl@&Xssoj)nIJ1PS-NR^REx|)%Jsg(hea8e@atXK<}`3wKuBl%Q&Vf
zu}w0QhmBg5`@eot$C@Ij?$LjQ_`mw|v(*U~pSR{TDV})3-DyyPGz5DXtYRjV&A&8#
z;VWS(tERMmJ|(nUtlc42SW62dWyfrC8p3vD2$X(ocID({P^aX;-7?et%W6{psPXRw
z0g?MRY$PL>S`!pm_blcG99mp!zx#cr`3o6}No(J?^Jbrf_H^4EL7Nj=fe!@kZt$-h
zI6SBJ8hiSH>|-}Dh%^ZH47pB^=h7Dw{EwFyl{<Gs6&1u>gChThE_6?by3c7c=^ztu
zZF8AGh81$^3sDs|_D6eI2EYA;p``c!@hwNPv5a)@<}Ce<>C<F)e!)r^Qz`vpIHJB@
zRHUq9z$B1kk_^P@Gf8a&%-vOwVDCuGZb|i9tG%(Qhs>zn<8EAaFuyy3mxZdx6--$Z
z^c8mXeqXM%7IdxU2$rcZI`tLhDHpIeWYCDSzf74?0p%=qM-(GK%DpZ)LM8@mQ$Tz;
zsv-X;gk?OpP55VFCRB2}1+z3NfvbpK@SAeLMj5%gb%v#mN4i4OJ=){FoPGHaVPh)3
z)e-t*d{L7j6~)vyN^_IR#62Nrf@2A)hV-7APUzfLC+~NzzdV2yS?zHxJDL`zUsi6x
zW8muO>^+2JOGMl~2)&4V41HUsq^=_toTr&2bg4+vF*HIxIoMOle#v1nKK5y^*`w?z
z$On6sp^BItR|PRE1{jEbYODJCx6<=J6~{mMD+>lK4$g%Sc3X>1y1!7Co(%kH7PMhr
z{Tj^WL6C*@9P8`?RB0VI=%+Q#<6;ZSjf(vXg3#aET5KLt1LY7n3iK~>dTO8x#JPt^
zIA>fP|A27N1ZxK<bbe`aQ`w0x+f5z7(<#L)tskE$${i#q91Bm-loZ%wikq8k_ANk-
zATyePLW{=RRv~1Kh+*HRazDo~dVrYox)HL%FEqiIZLIH%PsL?ey<-pdI_@bELQCcQ
z!Q@6*4qGzKg=DzyqLQZ=i;C8}9^OHVr*69KGT61{f)o!^v&{fy`zQ69`is0n)cu^A
zz~U}`g_e!+(w`rTdm}@u&Nz;n*qT(tf?p;l?4T;x%TW{#Dq<z@n-*h=D<=nLYGUIT
z8K^R+D07&m*4w0?K5od}{-r=LMb;;Ynt`*oNy8hSCCuh~g1I6n_*M~}tt{v&$Cime
zoMNTX7h<V{J;Crc=6+7EcF<C75dw6Ix8o8InVD5lo0suz_=B4&g}~}g$T&U8x<T<R
z%}Q{wM%Kf^@gv39U9Y)3(RUYfWjmW;KD?2AA7apNgI8CgZ}HlI7gjhd^$?fyo>$n_
zb1`v2S{$=^z7r7#U3DH<>d3aI#M-{?sIsX63SaCr$9^cx;W0mT8~X=#@=W05UY=T-
z*8&xOflEItp(=$Y{V$~RA={4M-w2I4Jj>%lm@(cJwkH<91zuRRZW;u5CDRXYg|?ta
zl4f^W7wrjXVG+(YZe7yueLYGV*<y&LBxFJd1(?TGvMnvhIpY2DZ|yjBnuOhqYs)Hi
zp3=EhU)nLlZpuHm+f7xpO6c*su&3;B&uU7hd1f%@U(T0*zd%BAxt#}+MRRfeE~Lu)
zYGdBEGclpMrf9cqLGOIAwkKeU6?&5M2*y8!rRy40-T*jdkjcjPCZIa|3O5~}42-~X
zZ>APAKVq9R+}wR4uyv%}I}Ttmc-Q=8m4YNr4e^@MF1@yDQwv?yJuKJbS7JuVG-DJY
zc4`~5A8p4Rq~RP!S<xwDOEoo`_Jdbps41scC1x}q59zC#1u|+lgEe_ElTubic9c|a
zhBb|~HP6G*N|i0Pc}-=U)*pdZclO~=j<{8ozx;}a&AF_olok=yy$kC*-grd5c3G`B
zTFtrd%uxE&+c${hOMY}<)C#>&&aI{v(%+X2uN@qn39L&qtGlPR<3=_t6QW+QI<e_2
z{7FJ2aD&NKB8cUYIX4(nr@@{c{?SVOny*c}!utMQc=1kQQ@{NEflk4e+ScpEsez4<
zCH16W1^JVh4&)n2fIU0)G&RB%J}*RMeSS~sU)<N*>Eb_exwT%jFq+^dlJ;~7pNYg$
zc5%4_!X5j3U2e8bHAB_n>BXa6lQ8l|T)M!f(vQx<LCvxX;>`iv`bOirdMf<(8Q~W0
z9^@<h#@27YvwjXln0df8Y(XAM<7QpF`@nhEO+qJ10dD<u)J4dmIHi4ttN$u6eJb^T
zJvE^8>4n`k4e-KebtH7@X<drgnWaQ~?H?X<fk178rBRoDVs<4=m})N7{n`&Z$p;|a
zXhHCt9f(pXBnQJj@&Qo_kv<Vr*TRsHSm$~olTA@)qAPu`H3Zt^3xbH<u+{o!n<f4?
zv=R5@oqqlCf!Xe8Hismm%rV~dmV1_`NO<KiL9bPN{m}u|W?)>X=Uq_=YFvMbFiT)f
zlrz3--Te(3>1f4n%>0b!cV5=oN9D=H#Wd$ae?KN6MKRNK#lWB(XGR&R3EP}WwSbCR
zgm7skTHKC7m>Y^e#SOQZS!%2UBQvY|gW>(P3gtvc&k@ThA>4oO0_5sMuUazG7~SuQ
zS;ONhRYH*#6v~M1N=76L%$V*(oI|G=MClyTJQmrD@;V2XZ`_mV4P&on<~g1?93Dd;
zcp`ElrViIskFN(WU;tiMAl2uKkmh0-f*EkSPKVL3XSU$wvQ|Co(x=fJZXnaBpeBS@
zMlE>bHFb?o4b#abf0#6|G=qU;OPYUGMNTY@vYQk8=HnQh)W7TImWsW(MCIHPxH?sF
z2UTi{=x0bY)Upd?jMG_%7C*BpTY2J%N3=OoA2G!T2m8oyR)lf1>mx5|>n9VkLKmjF
ztEF5Sq}km#iMXo@WlGU!@C0rex@OEQnOP;QittUnUa_0d0#@X;Ppq0&3T#d&2;Ry&
z8h>+16Ln^ED~|D>oKZZSjQWraiuT4Wt$lBu^JM#LGgeYCs97<RJGO-PfOMJt#HCKK
zjD(0qN^q3{c-R6{BeP|#%|WErh19d_TEmh|z6K({X@|I5dlHy=DaI-(h@WGvh(Xn1
zaE*K{h{fjj2}K3r8?E}j^=+w+qDYs1go4q4xM}6LG8g;rC8NcpgH{x5CJ<Za$sEXa
z1$RYeaYnEGE^R5U3zLVgb%vdDbBFw99Q^u&8d-z&gY<2@f`QKZ=ITh)V0^BpfE2IE
z0)d^Kv$C$v1w-kJ6^TIQpt<anF_*HS@rg%d$M0NEpSZb?x_)?HP8;oS&f~l<I=WDv
zWGdO;>A`aY#Yq_L=zWmcZ?F%1j9w*|bLb%QS-cc_Ra>Q<zt8n?P(Kj)vu<JQtjA=Z
zZI0cl30ASV^I`cH3XU0A$i@bhd`;&GVssbRsRQourSDy<;(uVUavv$sZ|eN+7x1Qy
zr%!%Syc{D}Rq`<fvm4zr>k2s~CUu#fNW&ZZIOqOK3fl7nXbBacm8{s5D?D+)2mBvj
zGFdfqNLwA5!FyU8&D%Y0RH=|W!1cU3(^chH2yOYwFR~usVC8rgC1d=B5T0}I$hgU@
z=(FY0&Z;K^8Fwr0r1uXOF6X?L!q#}G@7=v3(yqeP3ttDdy(-Zo^mRNel=AY9Pa;y~
zXAO-;ewVeY7sH)p;MbhOGCF+})GrfX!%KeMj7TxB6`o~R6n%!-WM~YE)x0ht^rW~`
z-gJH78dkBhCCCF^jI3Tqsbs-^OoZ`?>stQK@^I<Y)H|V&{k*vte*&Nx{}1-vCcyn~
zsGIte;H!=_MJs%E>gf$o_WUS9WG5z9yuaRE>k|2PV6Fjo?hG8!V0`RSGTfTz*HW-F
z$ossl7Snr_#<iPCP<>2NI8(03eD1h|OgL~J381&eOQE^wJ!|>KQG>96L_m@&x?x>M
zw!mq7?Ka~EM>roB!@;&TK~NogIHicw$<K<(VrkY%2=(Yk9t6h+-%D^lIMp4|^-ZhE
z=`r~&1Cu{4P531R{1~as<5BgRofoV}kWE8&_7f?ijn_7=X?q5lKBL2bnb9I4NwHzQ
z6PL)olF9$BbF`a;)1+ws_@TK;TCJ<)eQ>>9M96VH>>Xs>QpgM+>}|`zKchQL>W|U|
zPcap$Q!n5}ZZ?NDrgE*Mes>^tTBB9pxLa3kYDvE_oBlgDFgJ5CeUmz`>k8GY54xrg
z+O6xeIMtV?<bvM>y+T%8f=0=&V0~EVbR1YRtnGt(Oo`+r@Y3XsE4@1qyf4#m9hPOW
zS8QoIW@?%@A@!aXofb0vYmkb;jiGLu^DycMXU3QTP?GM9y^d%K=6TxGY7la9>~2=;
zw&n%;IZWovk~Rt~$IVOBsyDwdv|?P|w@z@>+=k2h#Xe_CkuC%2;2?wOAu8)&(Lir}
zZ<h@^8W-3d^gtJ(n1WcQl{H*2elql|+y^c5=AfWt`Qzx_pytT^qjqlOZfuI9ZU(&$
z6QRArE){s(g_=#V^@c(FrCoF;Am)y9!mG<~s7Q3|&e#O+(_vw8|9El`Q45ap*!v;u
zslz<G9xm>O%_&W?oxKGfBzM>lq}4239X>mt`gX5ygtVsxj<_w8d%5Za%UKQ6u$`;6
zNTz$7NQGKBSvV1Vkx&6Um5!?7@=7c)<bDTcWl)^`5AxnJDvoIB`y~({5Zob1aDpYc
zLvVL@cXzko?(XjH?(QywyZhjBCr8%%Jm=ic@49Q<Z@Z^wrhE04-u0`0Rb{6Lo{ct8
zFF*)H*U+Z$dyg9q!FYrB!5wKPg=U;qY@M76yWC?>Q^L9}`8CEkf3`i#fL-0eh;lPB
zTuyL08I6>gM`RAmNcoEY{fAEfLyxEd&zkVYsrxb=*eW&@eP`Na!w@8`ee2fuA&%A>
z$o}Xjk<hxcYpUfndA|1z+(;_e(5_QnS?eZxxUYuqO++hMT|vgilYF4q&}h@ij67Of
z$h5Y|*bg~`fzUtn?)c^(&al|uW`EEQ8y~u7XJGi)j=<7$?QKgr104$z^nTh-Pf8M<
zujie)z2g$r_5JdPiujnW85GZmK#EBpAMbNLvq%*bN%oE2LLQ@liakiIezCw>L41zr
zdi~WU>v;Ji6KoaC#O>{SK~if?czNHn2oViA1y`DwlLyL5)SaviXA&MW<axbnKM(6O
zp+TWwT&#kq)+9cqKE|zIkcMLeD3H}%$$&m>sXu&U1-d^P!#i*X)9%ODOhJXmvu%j2
z=qnMIOEmN_xcrE5aRuUnC(=9W)zN!F;>pz?P<q5+k9`3G^nbu48D%3;#2m+l9i08?
zn7hi_6%QVt@P3dm#SO9Y&h+4hLiw21iHc|kMxIuw9C6mqsYC(Kq<dNNN?2pZCLi>M
zUzg~nCR<DyKEKF#``13Usf(s;dt5!SN06Sqo=-FA!onb08Yb8Zq;w^W7>QAgPULh$
zscB-tP_bO;oHbaQ&`h9pc|h4Gp&MG^q|v9d++!yh*R(%=fzB-RucAEal^@YcVtrq^
zF%KB5w!p6WiCb_a0o&pIl^B?VY)Q-*so+V<1biDOczM<pW<h_kXXTpD>62Tz?;t@0
z;FAl1jauaP0)Dsy1V&4%K^evGP;rVlqY|9lBp1-oJ44_(m?g2s2$+4KqFz2p>nCut
zCHQ|(Y>#LwXLm8AkmPk88vB#YYI8QSOqIE;Mx}y@n?t^6nH1F#5Mh=ZT9$*mY+Pb1
zmm7+1BRlcP``(v%`S%yZWgs88?t-O$k-G>?jTG4NT$PT<4kHn>|I;zW#Gv`9-phZD
zE=qCCGcDbRXOQE0C9N)q67LE<pj!sQ6tU2qCowp6h)<K-J<EJB*@@>N`9op@`Rj6p
z&5NFLdZ5cjl^beudwBWRfHQAeXv-~XIOlOYhJ&ZL`d=ZVJO?)dib#WduKkm?+);at
zS`b&Pktcb*Ca#`{9AoIec`;ej+%XF3=H+>?Eedze&4_w;gR$HTl!IZKEvIuz`@Eaq
z@!t0ubGj6TI)AzCW#x6+zp>w`3JXJj<YygGMfV$zD)dI~fZ@NzL`_kBp%Z1_<dV2n
zR<0ApN6DZcx8AR<V#z34l(-^A+QB$*BUV3gVjcjoO4)Ox$PR^I!+i|;j`qDiys;H<
zyHu2w_vq7sAD%$m=LfOgcgL_>?5b;rh3c0EZYA$c$D5fA$$=h?lbyy|8erHB`_<H+
z6#bu>I?vBQY8a?$eXrB+ZL|f~cf>3Y_k7p;Kty}!oSYn=lUHwJAxVyT^AF4j_WO?#
z?6(Y!5@Z)QMwMV`Nptie+JS1b1PK%WX41!-*|fB2Rn_j!xbx4G+Jgr$pu`IfF?0fN
zEq~Tqm?W0K{a^{PsZ`HSKde!)MEQ<jZtS%UpBW*cc)xZ2Dc@5!<%2-#=eqVn>$s<s
zWFL51i``r|_?ZL~@pw0qfJ5ky#P@};|4^^Ca(*}r{2xdDNb;!?X8$*vH?GDyat`x0
zTRNl_2-QGGec3->=0EY?F1M40EWWWZt~F&yUbCpIWg2a8D)H_=Sy@o{W5ua<P(u1y
zq1<>42q^j`K8+xod;&lVugI21zttIHk+bigmFg`qppR=jKroX&w{X|nJ@Yu*7JJi4
zTZ$9*&j?-L3d`WHjNaHxiLS8YuzA&|w|1%5n4<9=kkz>v!Ukfqs7%^2;(A3w9KNOO
zB08nv{mFH;nV?7E6OKP`u^V0ntyR2}Q-kpX_wTx1Y6L?c6OsI<R>b&Y8<H#H(5A^v
z0hT52SyG?6WQ3BIj3|vCw$<DmW2jvzQVabS1l~_E5>I$8Jz)R#!Q;UgT?ve&-rQ6y
z7>8pOx3J*6P@|{$(bl{e)!HLdi7#{L$1X*MZP`la<nQc3F?CCZM$r-0=>>_Emrwnz
zWK3%OIB83~{SgN&A~_9>$l`iL>^_}BX0%VpSigK3sVC6(5~tgAXm~!fKC`>?zz^Ll
zdrwa`UjF?3QKGRaAN;8OZ8`G*ahWdZ%~x}vLBLXq1nG$LMcKl3Mn+ruFc;Ea;-t|}
zL=BYZFk4-h4?=$o;8ck(?tvlARSaE@#o|x*c0a`u1R6{JNT}EQ$@7m}C&G%jZil#%
zcuL^AN!QD%Sm3@Jg?6xFX~)$7UgPij>+~o_TIy4FjekZ>OI|tqet!q^5+E65>#Aq5
zJ9={KjYa3SI2($tiQMm*K*LnJ(j{1i`R+AfmN=#OOF1k1Q4=+TnS<GqWNfL*sIwy!
zqH00!LVo#ehb=seIT~AKBu9@2f88kitOeEIYNK}+@aef^HES&B;mzkP*{U>DoOQ*_
zm-oxtVS<Io97mPZX94}Qs-i;jnEI4y_eOkx283^tN0j2_3gyY4xkp1ae|x?BN7T00
zPbisFMQ0wcs<IQfZmlSK5Y-PW$Hyo72e0O?HU#oipl~G9FHyWLI)!yF^o_-L+iqBr
zygV%7$HugCr#`<Wp^Hu{)>`c78vDpu->$)xCCI`O0(T*oDi45+_T|;-b4h2}Fgow0
z5{ch>6lD2kj-4$0oa(j3Nl$YG?jnT{xHS95<G+v7oZmU9&iou9;$Zu7W<-fVkdq66
z7QwgohsnT`_)9C!dOdQN`HF2@CV^s$dofI?QELH#`lVrIU}0=RO0ZUsed;}e6aiJv
zogcS_&wD46g&G>8#s>T-%+1DSVVp<ixzuh=&>MwkL11xKIXO+Yy=LkLB~e!;V*Ri6
z>g2P883+88zMM?P#;PcL`J#XHdse2pft-F(9ZN7M-zbV+iQ(pL!aQ}TE)3INf|_cr
z?bktUoe~-yWmAS4^SmHyZThB0hTBNSDP^ci##_t@j7Co$w2Hp*G<PalC9A!|7}&qd
ztPEr!K_Ti*gZs!@V>r@)6yRFpzEfZCo&Sf0%M}qpCY(OtK|US&51DdnsPDXc`Cit!
z&hcB$Bo!@za2*Mh#1L%BSM|R8AikN^30%Mn6uPsuK?s!1V;s_^<Z+7CEC#P+2VZZ~
zYe{l<{-DP;Jp(bZFhMe<Tez;$DD53T+Eqxx@wKYKBoIQJI4F~c?@3c#rPL+Vl!Wz}
z4v@Rj$%eQ7n+w|Ov@~d|W??~cJu9sy;HQ2h)UT|(`%vG8n3&~*dwGXKzXI$W4@z94
z4Uc__!bfVaYc4J|ah?tw9%`WA-UnCekXLi+?}&`TUX4)cp{pa#t+=tfBjwwpxA*1A
zr0Tlr)W)Y{f%K~U`WLOn4~hV!poEQ2JjTyK>rKdg)9AmPPjw;7m;c1A*mU93yeaG*
z$l0)`T8_No&uYL8-B6r}>c32(!1Rb+qC?W$Z||7tC&eTzOLWqPWf}Yb>9|&-ZsnZc
zcYglC1-a?v@%7fx$?e6aF~MD4)s9JyjPY;pK8~*w#o+`DFJtkc=?GyxV=K8;`Rwx%
zOKM_RRCS8|=+1`x^IX$iY)s$2VOAJIV(L|F|5RyYHnDE=T5WEnw|lZ>QOVwogyUb*
z85(+vdE9O?kb64#=K=QDcvMfOH^S{(Sc=xBNRjO)p@@`S0tbC78eQ!UDsKp#cJR+&
z2+3?eBagjV`O{>@`@&gkA%4oy`I?x1f;wOFw1R#dm1%*Pp0d@{oW^dpmqK=|aH3|O
zqJxBdTO!U-_O7pkfw5VH=$V`cj3U<7!Me4U`}hAms8O68F;v$i?vOneFhOeu{I(up
zTb;#3rSqkKDlbb}trVfhXs+})eueprGs!&XdQBf>XPz+z@8jguMTJop{M>r0L~D6P
z4Gx-%V8}7HB&=e~AE^P>-V7-&5YyW&5pbJC$5vd*uSOdeX6BFj>8H}BV}i-%PFaQP
z5_8beS2je1I6gT5q^c@)XKG3O6T4L3{5DE9rz?t#ud$*b_S_z*CL$!3y`M1u_CT|1
z({R2KqoDnHwG@oB(XG+aExFZEpRs*IYl0M!hkGPHUA4!pnVzuF;tcP~hDYj!#&~Hs
z3Ew2<)!ZC+K00?4i6kM@fq~KD1Bh_{7XP*-+&c;^Z`B{a(X8}_&=Y}uhA6N;{_-@)
z4&HJkJ{fe}XFOx?CXsSpQLk~upM~>zsla%@6u!RF&+dV)L8}QbASP%D7CDTimJ7c0
zogrzI4T{LYr%&R1#meFymGj{h#zv*a@k!QRmkwD=%fZHe(CMo&E&oA<E+96WPP-xM
z{e@~N(O8Hvg=bC7Sf0kv8aJQO0vmw2(-{%IXVt+JvqWP=Q0=#v5Daiy66$*l0FKIk
zBzIX+)f^a6O5^J;am-k}zv~fMjm!xAeer02Zn~q&a2&)UQm0~D++*LuKPp_JH%jS8
zi>*|MRbC7n_@q-%QJWqbwU&O45J^T?oHFaYs-a9)es{mT9U2lCWgHia;m6$_dTgB(
zq&y=fKdnZ6$G*1zEB@&k7XDIk)`m27o&zi*bGa?MupFw6>T8GQTV55J>1(iby5!iV
zpGauRV@x3}T|j=P*iGQ5?q%V&2n-X0yej=tLv$YEEvfMu^<i1b@h>C}<?A1X8&7dJ
z8H|Np&i?caHru$y&^yELD@tD1aTTvS#ysCSHpJFb8ex*g$a`k`p^}S>&uj)AvcK6!
z@hPPHZk*VixTS}!xV2(8sc2dzyM6CZ7C3;IS>`q?7?S^nZOsLCcIR<6s;o_Hoz;Ni
z?wo(<R5TO!3lm3bVfNUp4P>;x;KD{ZnqGF#n+X<08I3s4{&HxV(fc$e+@RnweQy8!
zk5xTxf04ThCtcv4*b@zR$}mWhm%|d{8Zpw<#68t_oMw#G=?*?~eI*J$pu1{HzoKo4
z-~5hRv+GY;6#*y>%kMIZqYqcoZ?k$LVLbfTUt8iKr-&#$?i&sfat~@`lBPBeQ|V~c
z#P8E*L<7g$l8&b`vdJy;P9scU2lrK}e{%Qw9#U?X!M0v0)Gm=mw#Md1Xb#@nq&KZa
z>>m~Cc_S1BUDSLPHv~MyR#x!Pq=IgU;{B4C(|z>BBLcW)fd66@dOKl^0LF2^gL!3f
zZRmzqC*tbD^y3p)rXG8Z+la>6a3zl-9YZa$`dTWo@^`fG+q|4B+(PJ!4zKk=6sPw_
z50@4HpvKx!R;`uYO3$j9xEjKYoj)E<ozSbMpRkh3zcYA2rn!p^Z>IIx3i4h%b1`1-
zqc0`WqLOdA#Sx|9rD+QH&Mq7=y&tVNBEMbHL|HBMJvAp3H2m@WSwFXe81r?pmRac&
zX9EY<AiBA#oQwbZltT7fDFLDFv4OGwlLn&IshtytqoFXx33S@P<TxZ^G#XgNenPyK
zl1MX^NM;!_sH6IgPa1iHN)bM`pPe7$0D<pH!XBs38|%)M+Cki>M@L&(cUiL#>pQpl
z)_c=cRUL$&y<54${=sx~Ylft4(|ya|3B>2&`r5a~3o4YJZ3nPyf}bK!*T`b?-fsj5
zRkA&|sXyN{5EkM#HU0R<o^@??Pb}iE`QVSVV^663Z;JsuW&R)k^@XsI_5U;(|9{u4
zS2vQ=XIPSdUFFSm+4*j$$x{6eGEc5wVODV~5phAbW;DN6oTuEatNd96?&AROs9U=C
zD($@d%=|}+<%Wi~V4saeg~C4fkZ+xK_T!yG&FKgc4(<lVP5R}`ej6S9Q`@rQ;<+cG
zJPd~x#ZY+NI@h%Xj3h$$#&8a~c0umt!OhkTI;!G=<?^7qmB2f<H1~iV?60nbu}HqT
zqRx!QST4?MHIN%hcyYUEp^~g2!{dH8WI|!Ga`QrO=qsdxgNv$7>`R|PIK$OS)JL2y
z{U|s37>$22Zqs<a5Qjc<&25ALB}?<WrfL51$&ZRAa2?a)y?WXNxLtU%N7L!YoVnxV
zcg4f34#@?D%#U&?XAVoIh53l}CvQ`=<e(eLXs6t<Z?ntowCH`t$U~JzqZ`%(H`o?t
z^b(ATO|5L38Y2hcM<sYoQ#PEY9LiCo++jst@uF%+#tjQ2*v4CR_#fnbd-0wp3F=?y
zR6BnQ?`743aS7!G;1nD~p?)Ev;Ytmi#bMvwDRzG#su5R00>5KUkhf{iT>LZ*g{5-y
zb4+B0YIzdlN}#}3n>$tD`y@l{i`$&5S%j{`EY}1s74_jADTfJoau;RXzHUXkss`_b
zAXVch4#E+t<#gTOK?$XAEsXv{*&4s*Ry?uQ71*OHLI`D0f14EZ_|9cNyb;}Z=oiY%
z57!;`sWe+L{Qy-E?Zk27F(x`!P9dRSonpjzxju@*$h#~izqElQ3;L6rB6yN2G281~
zDHQsZ>aiN}>@!lA9@E(2a|=rSoQy|w?(O<A?fxu|8%vah1o;29u-I<d(CTK5*w37%
zv4>Tf!u?Z-_4FuphrUJQEJ~XTb5wNbQjh$aRt1QF4wU;aZ<kkD!EN`?fp#aYXW8RK
zg!+(WJAc5AELnSUc7YD;uV3I0voSxmCBxH#bT3Ame7bKhPCes4F`x$sQ!uE`OeRZy
zal$Hvc869RAWgSu=>G2iek;bDe=JPk@SYdT&+iDab(D8}4sIrKes|Bm_o2qn9{F;k
zQnKnDomgx_Em>is423w!Dt_HY#%}3ku_}^Sr9I){jt;F=Hn3Jy{W4&Wg=zVnq0u}2
zb;Xo5o!_)ysIiD`cU?7ctn(fEabWbz61`HX-GdA&+Ju8RPvgdz2D~IG6$5eyJ+n1p
zwW%+k;x+Q<<b!@+)bP`>!G^WXVqKC-Dy;o*R?NT@1D_^fJZHP}+#*zY?cpbJGNl*=
z`+B71bmYBxi2+x9uvb5C-BX1Ig5&x`21G@@lXjszD*xi(0G1=a;~{O9)o8uQC2Uek
zXrFNL5tv{eQi~3}>ZZ+L1Ql86$L(iF%XQFpe0ZZ7D{nHg<+&f|h<-AzuJzF&;PjfX
zh<!V(g~@-(x_?~m<;Q<9m_^444BJ2)9dQTvPs~W0o3q@TzV}zHiv<>}S>kiG#w@oq
z<4NPL&*M?$lpoV4N|z&Liro|c5NI~n*5=urYtA*NAiR9DVjnqNxnO+XY&OcgHl%(i
zNylq{;=|{n^7?(Ck*+-IrO7`py@-5s58O35^^$%Yl4m~^HNoL{|H-edF5sm}V4~Zq
zOSeLmm+@v6Tx2^BT1XC0VZ@alt7{1O2iX&+@Nf>x>lHJ<f5gLtLBtO;GV{cW1P{@z
z%)L{kDoN{?1L!8+xRI=1xCr}`HYylOQa-Ec<A{&musZF~Tc#S|Gf_?}_T}`EU92@H
zNXEc+G(euY?SFpGq9)dRCF#*Um{5+?3!U1*+?>m6xSK1x_Ny8va}vc6?|a)4qj|~X
z5R?q{txxgEBdDWEJx`mPfuFbEYWjzbDQZIzMj#3eZ_-thTHbXS&0-19qg_bA@{ND>
zUBmZ>8GM(E3}T`e^_+$)r$$Jv9WhUoL!#?L4y*)^gtI2EOt5A{U;x3JZ=|96Fg>`6
z>4226HIizSC27ChSm&YUQ+{B(9F;h;RSzo!HR!G>Ey<kDlk0}@){pQf4df~2I~v2<
zo%@zMUGd6H$S#lVO2=)fJJ!e3suFcFyL`1hsL;48x4Xxl+S%N{ig&5?<t#~Jb+#o~
zIzQ@~QtB?b9wsoia_$q(N-C_ySCs{><DabmS}QwaOxTlu7R3P&NR#x>`~{6>8k;GR
z=yI5if#-+;xeaM-jSujY7aPMURZIrLPj2i0CCHrfQr*okVG0)Zv6C9kNUIa>)QKqq
zNoT3v3Ca{eB69AsDM#kmhO@-DH$?zEqSjttoUX--7p(6x9ePv}Prx#yEn1fsl>MsS
zor6s){tf{S;HKigY|Doq`Zq>JB`$~s5Me9tb*$i)Mv?mmwkCr@jRnu{AqAND&q)QH
z!LMDyx66pisKqsN3d!Qv$5{B1_um@aj5$awwT4Pf-xSUO{43h>bYA>9$10gY9?zPX
zm%4{kKh8Uo&UwTL`*jPPw^ke7I6#QWN>}kWv~{P$bORnwq4&8Ki{n0LwAG6_2R6@|
z$N^mSJZ8;i&`tQ5e3y%eu&ONs@@Z2wC+@g49*J~a>=`SM+eGHA3)f4;z+^)u;2P4}
zy!GE&0Fq_Iw{{17b?nect5L1df_lOb$!m?QOqk8Vgk?hmQs%fSd#QL!<1}&%;i~<+
z`bf~+kpH-D2X@!&e*C6+k?Lt8*zEPl$k^Z(k~YEB8k~{Y5@Yb5vCsuP@C!%GBtE0u
zXUYw`ok90QEgzrPpdQiCaAiJdTMaII;tw$*nyzhmNq@QC{~nhL14Fd$1!34S(Ql<H
z^hL+itRIazYNGOG6?|7dgoY;n0=!ewtTnadJd{M=_Y|&qa<ichMlZ54CA2Bkv}KvM
zXqP@?y;}n8ujM=Q$;22HMW^5a771<@8<jH{TVHCe;|KxM&c|T(Lfj!U;liYFaSeV5
zaku?;b8w+&MZ@ve<oS2lm>N2Nitg}VcMR!I^%^GmC1Zci7nFZMV%7ha;lUVOlS3;W
z!a4~zH>u`ffFQEf`jR4OlAVz0_1m<taqRb?XJy9uc}TuMg8bsNq}uPgI;#ttK>-$H
z40c-ZNw?d0Xl<7==RUm309+%S)bE50F-Y*)FhnFG*zc7WepcfZq-SN$UIyR5b`xxw
z%TomB*S;I8`t1h8^YHdTa$ux2-B7GOMKo{w^><HymjEg%q<4uN3Q}QcvS19tL3JMd
zq)%bg#78e=r0;TzRGV10Kp;pn4M=H1#DjbKmkr+h)6QW7baif@Wk=i3W<I8Fof({I
z<R_9U7is5gi5o6Q+VI9{$L%gD{4EWW`%u3*+6b`Ly$gc`k?1B;E~9k<%9P8ujvmsd
z=95o%JZY5vr|tiUJoM)4+>gY8`aM3F4c5Cw5Av0B5&id;rfvVE&HKiVzU$y_jkbvq
zoo8SMe%Nga#+V{i<8h^j%W&T0I#l~YP1EwH%t6#YZ`07u@()&QlHQlu4MPZ;{D#hP
znLl?&CvMcFoXMryujjpuo-+FxAw-=O?asDj*)d7tP*@Ipy2fr<TOA0q|3a7}$0pjw
zbW1Xu4Iw><^e4FV64nRI%-LCv)Bd@4bCOH_dTY}!gjO{4B`M!6U9j9NSz@-i{U;A!
zx~indW^Lj%t$Sh<Obuvi<(@B}T-*deS=l!w8MRkAFsk3kLy{*I<XXKE6R^LHM1Dqt
z7-Oj-yC+i%&uvMRCgn_-NJCvGc$6TEID~7P@^iFnx&*5ja(RiVw&N<hz`CuOnhFQ>
z6v7o<%Ddv?#doAj&t-(ECd3E~KY@F>JsaI%3d}b3E-NmC|0qs5zOCcLjgA}!`ry^G
zOh>m5qHsR^sR7v0V$xf~0}?+=QX~4Is(Zv^&ltShP-RFq#CO<6&sG~+XrrFHa&Rv#
zNX8GrOM5bqx?;GChq3-zN2rq<(L!VIxo2z%9Ez({LpbB%7kP#-FbJGgOXe?GO7dOv
z5%o?S;(q_i##zBu`tg`5g><`QNxHZuu|Gn5=0=`6v8gT5FNyc5`57QNI0$$jTcUI8
zefbS~_su39&|Q2pd@4>1MYMW_7h~6l3k)+ZHgQBK=>RS;B1!VGn!LTGY&ga&e(IhT
znh3ILM8%b~@CgYWML3)^1)2DW$vnSu))K@*3BR<enSQ&3>-?sc@#j2r|2boW(HorU
zyk4u8V~Tw#<;c-<!96EplGht<<k*1hMc2$UXv+k}FZG9K(S&2-xR~|SvSu&=`#|LW
z=wlps23_FrqF5wTOCsJ;;e$|?6e0+NEDI<$FnBdanU|Qs>|`!~chHV9kTT~=AmF%N
zb?tO)PDh>UJGyZ#d(g$<8ss>eS7hPXH8RcHk6lnDz~QQK=9N^1H&Ui>Z+jl%`E7eQ
zv$E}`8c-5D8Y!Z8NZFZm_@88@YxZAcg)EUr(}KR{!0rYITqNSbxz0Z$GfMNkttzHz
zN-d-VFtK75k*Pw=m|p69pxZqTSUAVhgFk8?K>3$Y;m@sPkRzsMiW8s<Eq4d*PAN&A
z<*u+h(`r*YdW$A&WAW6iZythhkc6Hfc<^49ifbeBK~6bu-N04oYAv$0q_i>m&YOPa
zuEWtnE#auWb=w|a(~8kaYnhhuR(Gp|T67-O<TU$yHbNSU^V*b}V8op5ZF-_w?_HQh
zr>L$Id9Hgp^kPsJw3AY2*B|tDhb~m2V)tYe?~Dt~X^TjVf$tw<JxSqHDEHl%OJHo?
zNMCxfWl8)dcKTeqdF0i*`*+nZa1PTbIX-mo!Y}Zyo1I&VVs=JkiYei|7a=-m6%+eO
zMY4v@_>8Zw4M`sJW<%gwb*Vc9W@3yr{AksB+Y-tVtBR62l(Tv8i-9vWd|9MkUt784
zzZiv%q8&X0;$cmlDF&e~6yG0mEk)$p*ck$9y~D4%`P}u91^h!XVvJug=c{>Gg64R2
z$F;Hiwu!+wIttA~VK@5&@zTh`_YuGSh7ez>`aepD=Tm%~2lDN-83o)C!$WsEgLm`P
zhh(l<;ht*^r)9MIIV0qctzk?=2_K=US>PDO@RaePD2Is=a+_Vn?S`e?u-4*a&KsKo
z2(i-nBhy<R=`7jf6J37US>`~mAD|)Ab6MqS^f=Bu^-ShFyFn+xrQK%fQv3pkusf-<
zRH09S_1}(o5fKxTm_*e0Oi5CAi@bSA8=wPn+xxs!kv_%`#40jUux@$R!Eyp5ezjNZ
zQ0?IYY8MRI?YgRlmtt4y`Te?w7NhX6#n^N4_no)jsJx@=GIKE^uW%0w2BsA{n_tfz
zK$gs(hVlqUXDh{?W;+_}0PE_P7CQ@yBUI(+j1x|*8(i^e@}k%)0#eSWQ`kFavN(gg
zqJahOHDJ~FBf7=K4?=HigduduflWXd(cn#@^i-Uicv485b%$uoMC3#zvF@?i-PV(0
z_r{4C+`7ORLn(qWD*A@h3V=*b$<<8w9qaPQ*m@Gb)S+7$4V)i7pDX!<ElM9SA_*$L
zb-qS2yTLVDc{qhk&jH78GQHQeJpoYR-nQeL?Iof7NFGB~<i(0RMCK@(X~=xBo%j5>
zeu$k+zW(?S-8=cf%_%0Zz~ErO<oqqy9em^ZO}Ik?e1^1A$9_rnMtd|zMOMaWO8@*r
z>YAR+eb*=)XejZm?_Yx$kj<|c^kMVH@j7D3?_EaR>F9ZR!Ld}upXKP_a3$^L3JgZl
zHgjSa(xU9IwfmY9SQBTxlr}8aRQKZogDvRBa9*+RV(D4s&2vd$^TI=xUvo{Vu>h21
zg`6ztjsUc#Zt{%d`3DAqQ#P$T0j@lHM}R4}%2sWbd#QO{ak6q>Z2Q=$zAii0W^_M|
zEqko{8~KlT4f1+4f)k5=;o|kWD3i!4?EY@jryQL?jhZ^;5BgJotR#@v{T~2}&x4li
zRBVFgBNyy12Lti6d9t{Fd&lGdztD6FlQ1h!-uR)jbnCwSB!Mu6UpZjWRTEE5un^s3
zjVS$UhAI8ZoCIN=a+I+)u2oeOsws_saF>9b1<z==rCcR9GY9fYw=HX~c<&wqzOmiy
zl_}i0YL{0SAWF9{2A>Z$-KAXp$SOo}YL|&zIZt|HY*&w3xh^rVem#FuA&9#O4Jkco
zOsm=)5jzwgbjryLElfKqlJ7|{g4G(=9kOrP%V?0YTh)$t0zw{Fu@n<}W44X9@?2VY
z(0ZKv9W>dk$eN^HIa^7JWhMJFI~OEEbdByyK5%qQWhK$CaI=o0PQ_2}S!SI%X1yP$
z-Ce(ZW!R9UX&THgtV$^KWvy+p-pAwt{60yrgDthrzD|*@c}QR|=}H)o?^>Ol<I(DS
zO^M4|KOGMz+`d;jNv(trZoG}C9}gkaH9`_CQ;NgB{3tQjpJY+4Wh2HpFX}8v+jZyW
zqTX&6nE*u8dn(1HPPw^fIxxxKdr>9+6l2$>|0lPKl{R}<l8%TSZ_KhJq}s5yn|sf#
zFZ(Ix-NCDUcl|PV2Pzgga)cT;b70hewH26<zvhK%>LFDh$jDqJKJe=T`Sft9mAu2!
zQ+BZJhtN|y!&S<cU?bDPX#!q>fEU@B1w;u0BAud~cAT+SYWm$j=>(ErNeS8JfgR}b
zf-r)L6?g34f6=Ebn9Kb-N;h^wXb?u-KgzD#B|7MKEf=eoRR#n;$r0pyU8?=TH1Art
zb1W7N-@Rd6R8$rPnjp1=lG42$Iq^)%)ue;j15$I^YBNIxsKE)PyO95sRX9mlqsM43
zL8~QOsYu+9nt%&q<Q1jp2kfVJ8bC{e-6|W_a+>u&fb7V~1erw#>@jksFVdthF=#|+
zM}O%>$1!Ka;4feLloD1C|4;-OMpzKgjs`|#n*$gUs(0Z<;7<$pce%$qKIL=8j>Osw
zm`hYC{`I9+kyEQ~_YHc%VV{c7YmSj?0@2#FB+L4wtjkiQUZ+3o$-0^`u`CWPw##r$
z%!-DPg@l~Q#Cyka0qT5x=v0O4E$NJfb#I+sHwel)b`%t;s`-^8Bs3ld`0T}$+<Yrf
zUKWqYY&;p`FYuptN?$75TI{nE7B9ve%=Cs=o3VCfuyHMytbamu2ev#@v`M8Mwdd-1
z{0ciWFkW*@8~s6HS=eU<o#7kqI3|L{)N+r@BCm_^#8CU_Dny}EK*=(+;&cE~tTd>N
zd&0J9a|YSYK+!{tt;nuPm*cxtD6zK^-uHlf_;1x2G+QBJ5M3UOrp@Vp`Ql01!RhRX
z3?-|=?6_jZA15KkvZbGcza|nEqNHogKMWH;EFgs-u}t=+FU?x)0qk-X5oldnQkqG4
z@v0;S<vNNQNOvH|I2)>I*C+|EZNp!k!`qDB9dX}Q$pyie?9yicxC#s`|7mZhd%B9B
zme&p(jl3MIRRys?86vl#<ol$~@Zmw!C16%}o#2`KkVH{owDD9>jm^559Am`)R$4fE
zPtUy^_Hn-O;&zQk@<U(Ez&PR0)R+Rm3t9e)rDmR*MCi^p8&z`i{x7EYDsy<_`u2db
za;>apJWlMc3D<F<_sb!BMt&-1cS6NY;>>-UzO0e9@L|EPUO-Ua)GVM4UuO<{!1Ii2
zz?s6*egSyS(737J_3fad!ePGcd`atNDIci(Z>h!cQT8n#$LixizG+&CnQ2j-xbSSN
zBXx)V9_^~TrYo*7)VKl~e$KdoAP8C$E)&Mja7Ml;mEOOo1??W9x<eO5CCEJU5-|Vv
z-o#<1nbqg<4sG8(G{LL(qfJ%(mKgJ^@3#5G7B{R(JM0vaA!9wMMzG)(q-SyQmLTwO
zp=+glcwq-q7v^$pAvZZc8KNg1Oes2*(u<t4syam-3w|B;^d3;?oosp&fw9P|<ccT#
zmK2lS7Q=jKT23$fECw&jqbjZDljf$G-?h!npY%{;q2LJw(f(rw0Z(tLGNPWFzday}
z(xtLnAn{SWHyup3u~XUM^Wy3D7|d+|^5@JkjY}Zp-V&TA|6HXr%154+6Q3&>OU9cq
zu#zBm3VdK@zGpR;6yT}~$J^u|vS1+nVlU3YrKxl$J{XRu4}h?`A0j62*Ltwr4G+<F
z@<UL>d$w)%f`ZoMshm3ZLs;N0M)+7hD>eJ(@b01KT%5J6co;SDbm=pZgRras>>Y3d
zel@Pvq<MS5Tv)=fMAfL#q*(`+KaXm!dAm2zYJ0wZbmn+1SCy8U$@09;iJ$C#0F{BI
zrKOnC?d9*RMSPYGpi}0ux6T&|S!|1?T8fH_Z!c*l)=r%{jc}k-FJ;f8T^;!cEb1B|
z?~0!LB}ogj;-#fdN~K-yC&AB{X{DupG-mH}(DO}g8GurcCsjx93`xZb<N1!Rae4Df
z(4|)AGdJiAaodX%-(N20({I8~$U&bfzs%#8nYVSl`nlCErECmzHF4Y)YAJ2LcTia>
zvt_=@k6m4ICzRC4cyboWoy#aH;?);(DGz<F+=mB_;JKcR+?xYjUZW&;2OW~nt4^ps
z8s*<KUEAs;@@`D!KVly^m<|=PK-YN|9s72|06kUA#SeJf{{%D%6I4`8TfN!*32&|3
z;U+ZU$|+r88&CyY7C*6$)O7G%D$>HkP51=t%f(zE|2R!wWhgE!1?IXj9y*&%x+hf)
z96waDmUy0wbWD}HOREbF>5Si}8$M^WKE&5$R=yX=c)qT%EG+UWDsq%9bg=Q$$u5)`
z-z;pt)|fV*?G9AzOaCmLdiCaivl9TwDgxu|15Tp6wU5K6WWt>=N*mTsY+f@xR#mIA
z=J(5ZHq^P=&xRZ6*fYjX0Bl{~F08Ze)P(jG6=}>SdN3E>94C`K<dUkmpMva96qVK2
z-A6$`)F@SY;0D&JxydRnY?v_}Zr&`c=D0Mp=Ww@_Np!GMYDzm=%d3ZK;Y%quXevY<
z;L2=TDR(ptfbN8W-?4mk9`WNYw)MfA+G;iH@O@71F84W&MWf{(J^1e9c6V9H=b%)^
z*0$krkL_G8d15a8!OL}A8kU%wG@mY?lyQ2_nw{7>d(0MhSx%hxz>ZULs8_bCYoA#5
z_aTD>SI>`#&iJw)^z)0H13LcFljxDlg@P3xu0_wge#Yr<o8o7lQtr)~u9wbz?e*wQ
zZ>vG?Ol)rp*lC++4Xl9C0$5;5R%dbNJ*KhyIR#Jl`y-+^-kfs#Qo@VvAa>D^Hd~(4
z-?7_Po(UR{KxgZykhmeIQ+K?`Ea|jWditM!7>C@Yb!rnU*R^W%J2()nKg*PdO0=hT
zlc_2{&O^(9>8&40pb^cXxxiD47+F{Ka(y{ApZK0b&oMMDJwNki((BnP?yac4A(Qob
zEN(WlxrUo2kv#!I&_ZX2!b_)LP|q2-!ws`uL8HN=s_s(jEsGQA+{LqH=EvM{-=*a+
zxUSBF!#3Dv`Lp!F^BD=Cy`cPlE(lDppY&S1-5{7JxENM8XV0K3VS6l3z0JJkh_W2-
zvt2xD+;>gOV(iM{X4CVyV)cqzC(st3-tC+yqPzZj0$L3myARTN!qcT$y>a{O5z;jA
zm<VbR*#i$L{rpPZ%r#1|jfyJDqertr`&V=NFsq`14S}up2rOf;M_mn@pT0ktW_N-K
zKPThk^Y%0PtoV=L?UoYux(+hAVEAI7a`7i6Fnu~YY&P0shj5<!kz2Wduo+B9*qj>J
z#arH(Qs~p>>Tsr-X9^X2QM$L>ljlGyLPjdZ^ZWu*BefBYsBE#972Y~eZV7W~?*(x;
zr?ondnLfKiS8zEUok0J=8Y`j~9E{nb(ETvS=zN<n*nwXnh46I?&zXGw0)x;!Yc{|v
zC7~~@6}Tu6%g8f75nvo}%nq405DM>o_NGMLh#UY<A8lYW$WEk(kyxi>q2c|>{eNQD
zcBcf$Zr#Fdh+s`j;cTen!b8AFoonufKPl4;c2RW>>k}$UxeYKB9qW+!^Mr-oZU!(P
z{ws0+tP~>B?7WD45+tCkN)GmyiEp+Sb>GuHe$(+rlN*;NlAAKeoDj}06-v|Nz%N-K
z2dI-47P>Hk{FN=Nspw2zRpFNKYZF|2r@XYa2d<<sfX+joskM^QtPRd!?XYwSfGQe7
zSyY~rRxr|OvBba5F+T4(9nsR@?+KhFwS>^Yb<>)#RQFG;n{d!-AJ17+JG6ty&Ny#f
z&{24~t@!|LoXsx+cY|y<GOYip^|JAnzp6LEAWOVebA&bxdoBir8_2mKB|ZyD=-w{l
z9*^7Wcu>|91k7t*bwq;m`~6x_mbHB+%D}8paaiwb;`|Is@qGq&c$ulcRY5}qLcXkg
z-R5WH>G!N}Qiom@vTH!|vF5iizi&rX+1XK{4iPE5{$7U6va*wH(BS+KTYjfarxlfJ
zkqnVu{}VG0uXj$#w*mW>*l}HfLzdilYA4)2{z00qJyG3Y)vqfHftva?PDV&-!hq)8
zQe)WXs<xfd3`r56gbk|S)m4TFb^~)hx`7N_{C_@K@=_(?(&brMr<k-gFuGn;LHV%J
z<*#k;)`SExFYQ{E<DeF8y8&YWp1MPi`YBFUjIvcra|WV(Y}SXF*59743>Z4T@|&7v
z1lShrG)NE+)ox=&T7bN9&$ZG^<3uN?{BP7>G_R=PdceH=$8YS<vpUuW7&dGZIfxre
z;4=lcg0FRVcAE}80TYoS+E7jN`Gs|@!EeAA11=nAh@{;=!P8Xp$>T_;{KWTq4$!ws
zkV=)Szsqc`cnBA>JmJY!>&YA*DjgZHAnCl<mGmYi6f!w`E1h#!Uf%}#lIAi7OK$#)
zj3Pc<yHU@Do6vhVCrlQqjc%}tm@d;dR)yTJ^)UW2N_3Qm`*AysPrHqXiDr>1qr?@z
z*Ll%lq3k{c-uOX+JeMLS_Hda)uTQ)Z!g38axAi&cWAv!~$M%TDh6jV+Jp7>ZbmaQ>
z=+x)T!8vbpJn4b7k{5eOXsu!*%Q3fEAzU!ih!T<n{G1bF9<`s?B5m@Ytf%N+nk^q3
zH=!Q?#g^-OmyNxH0R9YGgtv>;mLtg{AswD~o2bV;p}zZR%BZKi6u2?;Q7O<<(|6KJ
z65Ao=nRJt|o&6&q;oR60<9bW7BDSc1`p>PRqIH@hQYh3N+IuG)DM_JKE0+-8Hm{CE
z(Iahc;L<fdmh&8iM<rEKj_n{_J>s<<-)?8n`z}8>7tfssaS%~wgnnodDY={xNCsqd
zCSX|m+)sjKWsizg#+jm%H4<s?T4SNcShKzF;j-c1C@wv_X2Q)T%R2{2MZ*9`4YTzu
zhBFr>beq$$c2v_}{8rZnTqiJo8aHq8*md0`vx>lk1k1R@tbV}@Q2dq@!9Hj859I9O
ziW<&$)+-QyLX74;eTkb6&CaT+COC&4P|OO^VO(n33a(RdJkRD9K_?)|*c-_`o|MkP
zIrd~`#tzp%>6pP5+4XH5S`pV=PNaEg;=9>@D8%S*e$;K5=s`#*C^$9WY(}&gJD@av
zo$$bN`6-S9{JhF;(q_=r(q!$>pnKwrq>qw<D4_pRoRE3F1Wra?Sopy<bQf^C9N!Ts
zyi^otv%bR4!U#LZgi+Z#x$&T7l4n0nz;DN%GUAo2CUl0&=%yDjZBv@51^r9s;QYi1
zt_EroF?7&k>f66iHhc%8UUu4stTc*SR2uDcl$1ROMz>LA(Zx<fk0OU8uXzpY7+pX8
zywF*Fhq~n0xicyUYW@4W#AU0<DVUCF-a(%g;;{;y_q;FZZ|UAQDisvBn8L66(04L&
z7jv@mGQ7g$oQHKxJu@UwX~m^Zjf#Mw28D*tw$c_z61U_sb5f_IZMPB*880kItrhP7
zk_H+*!Uuz-fxGp#$`WN`CH72O>e5XBXx0uOL4{P0)4<Lv2Wx77wvpJTseOTdLDNtQ
zp?XpXR9qTE83U^5SL?#;eDpq#!;nuIbWnW*3<~w^OXn3;g}x>zy&X$4)l_6S+?kSM
zYH-hBRE1rdoyS-%dvy1m1ze??Ppg-p-HP*@+w1!P$-@%NRZFGNfnwA(0*B+m#H5Sb
zs!5AMM`gzg=f%scH|M^59=R?ph_4sSF)=%Mtwl{@MQrmiS$pq=$5a&gNu@nJ1DPXT
z89@6F<%2RVJcrd!!Yqmj6B1E$C^@?QvUB|}{!Q;E2~(C<D9^{AU&5Pe&Tx3bouB8P
zy=W@x-~}i^^%>eF)_<UmM`TVf5H+0}>JUNR*>YsACI{`7%kH*S0j`+6+5iM?E^AHe
zXEnn~-4(V42YhTV@5>_AzmW#q?J1nV81#lW5+f@rV`s-;VVof&yn$pMbWs&cate0Q
zc#uwyp#(7@#|Fp0F}wauXbO*PX81u)3zF5jO2V;UU4Qoq#=Fh>>eBc~-w~;r&)J?D
z54NP-11kD1B_qL}a)B^eRWGmX<e%$Rt$H@DaRny&iMVs=Xk<BB%9fX#V-w=M9xPm3
zDH@w6t@z4mbb~o2Hb!8zO?U<P%Eo6Dr27TFwd5h>Kf?=mPaSEk>3|8`l;Md>57l)V
zpU@n9>spLu*~L51&D0owh$igqpXJaS5q_X73W#o}Em!N$`PaN|$2?5`z*4Rx&AWnk
z;!%!-hxFWmt(7OYIk1|)2L;wCY9+!;5xvB4!$uy~_7!GlFo!)t3-iPu(^$3cDWJUP
z`ICEV`o2#Z#D>4u!&P80MFq5a+;cC5+s>%f9)f8cIP~yFIy>lp4JN#eK{I|^j~S1>
z;zW334ocacG3Xb{J-3UKT76s7e`jzNrq*fdJr7eoJ#f&{@xXn~OAsJyQw#bpmQN!N
z1e>AOkm-A0!<B*#p!3#+3yi8nOc-j_&K$VeA;xv7MCmNe{whuCu9>@pS4R6Eg2^kc
z1G5mP%ENb5x}Vr@Jw#5ONJ1z4rz)*pkjmOTy>(MQO0v^!_{~;j)Z4!uvC_O6-0mzB
zDf-ETVwZBG%@isdf`VR#-9h1FAWv=I??LA|I*Q%-nP?xZ*_|nwS3X+9p1NSztaDr3
zJ1mH-O|TN3J2ZFgS$qJu=dp8WQDZ-XZ1%sk0D47B3X(I+UU(K*P2Q4H-asZJVu|cZ
zq0c_c`&1&V8okzDWk^`KgBd+bQO6sr=gW7Ti`&gl3co5<#$tPk&%6r3&z0Z4gou-K
z5C@z!c<!O^)%N9NF9g)=Z7<}zlxTM}*^Y<FdfEuWn!3>12hU1fkF_MXl~dTyEyr)S
z6H<_XYL0RU4iG_@qMoB}JeRScn%H+nD&kAlPnle?CPsuFY6`jJ{W(sdng&N6TmgBF
z1$qx)Hl9$<aylfa6Kq{A^zkF>9Q21CM6^eD3b?}YE-%`c8(sfBy)6Hl_wRkc%PaCG
z__vD4$3<XA+oeVqg6HDQOb^fTP^Xcg%U`Ha6|^A*3d}kf>mIMcl>X)U=9c&wVwtYX
zF<LWe*5x@SS^asTkee={I*-*y*t@>_FytuWq=H6g$rJ*>;@-?<(_3h>j>sr~l|>fY
zm>=aDP?AvC>xTwexqtXD)=cS-)Rg(7-THQi-~fS-pZ`+=00i@0xVQJ7$DgR0)|u@l
zW;WMxl+F5SD1+U1-YDG86;mBTZ?%-V+D5dv9XZ~OFXi8>y3WW#wb{k+pb~lLRZ(%0
zTOBv0HAQI{F16|0_~zU~*Mp{Pn5Ywkl}7!AKz?pXdB@HBm1rs^5a9=5{@|$r1I>~a
zLoKxL|A7Afk9M)5|B|%oiN*cjXigmF|C_)IDRt`QmbOluw>EwSDlh^OKP<hdlJEiw
zzfBwzor=*wdnJse8A*S2kiVa$byB`}SKDG>Veaf3qzU_`e>ww-dBhw|5kVo2C^rYi
z!MX0FprqomBq#sFn}L?=v4P#JIPr8K#tgVKu{PNMtG7h_&dj#c3Up5^EvX+Tn>gZu
zX?cyG2;T02*Olm6p9AM5sZf)&n8qxk#^2?zQeCrAk7Ai8s?1}`uW`i_7&iU-_aUx<
z8^|$yN{lKU{>jF#UWJ#o9<_XF5QBY+s6y9aAD;_FrO^^x(idqJ=)1t95?Gn*Ut2h)
z!d=7XLb(!^kCgM<82AnMMDnnN2?`XOX;*w|b%ASA(C|W^k&0EzOgmWYw-JbnOKEyg
zs>I7doVp<H7{C9&LcK1be_y`*2g!s3Q<gycID|P_1%+CZ$DE6${g<Xvld8HShF@Bl
z&jRe88$`oA$J4uF;Z5D@??FLU+R`euy+ILgSoDiEoXsgaOJgOcwSM$r{^985p|`tt
zW$wj{R=NPs$q=6w>Lkw&#H;xmZZYi=!V~gS(YgO%;qW{-k(kpqo1idLDxJ&h(#J1`
zhc7kT!h5d{^p`A1AQ=DuA|yqkDr^cJt~!OFkzk=Fb2nu4?K5;}TL0cuqq;0_r|BWj
z7c@(!NhR*n^xB+U#(K4y><0H43c#lRNC};~dTi?-6O-{5x33KTr?PQlE`OgO0hjWA
zcZPsQC@P_G6m>6)Uf0$KC1vnH7zO59<^$C%>J(EynYpJO=5H5E(5K8x_|Oh_H!YD~
zf7CgD@Jenz<$h#Mc-0~W_#m0F#_k5jD5Ms=QrN(%&mpTBSS+Ah+x~Q^I8^DvZ_syN
zS^-!UwOKi@TV_OZmCY^g7aOx>>`>o*&r^Z2%u#m8!Q}iiQA@#UMB^aJr1M3G4|VR6
z0A@If<g>ZC>1LF7=+oZqQ@eNjMi)2FZ1WeG!vOuWQq#fmDPBv{?P1}Z@<&s`)@VW;
zgjaG;S3!dbdmX7U{ZmSG*z)-fSw2hmK;u8$qMJ;^%L@G`ps(F`iLPtc&2SxM7fg!W
zg&MDPs`RoQ`CQ3ly5%N?73r<KODtB1(r0^A`|L>H%VE?8VkVY-`USxp2C+UkQI61L
zvlGFesb8eVX4{Nxx*()AEi%2?+c^s*F}0tA$AkU0)|0-c2a6f-r^9b{DN?S>g<&k$
zXPfq&2rzzctT4Q8Bz0ESlm3qOldJqs84d0$KO4Cob6i~MzKA|_V-UTJ-gf(p;Akpq
z_U@u9Et8zNrw<;_1kvhwH!}(>>mX^rps^!%12Nl~3pY%`oP?t&c7Z)82!{i0jXya9
z=4?4WNUwjmXOnrhmN90n1#dnxP}sXO#UbD;;5+K*_@V2%QI_Ft-(ew0aQi-sy1!mI
znW`2=rgO!fRfK*!ivu$^gMyGw7t_Z4sjVjPMjB=LS>uyfO4#HmO6eq7Eh)piH%|N6
zAkGFmjchwm^=_Ni0)jT)>&C>57I8pghtB_J0`or#W%zUOQkxkO@dfO}#-M&QrEQdT
ztW^(AA3`;7#<Q_DZ-aLpc3U}bJ3#p7M%K<c6=qIKQ#&9wXM2AMPPaQ^tFi450P{!2
zBem6E{v?nq9agY?q)S<8q&qC~8=k=nQMlz7vgc=?(B>x;Z)~UE-k23eRJc6h4>WLA
zYvJ30X(85XBfYXy4^mR-Y8PqV{*6|c0-xyNv5zUIAV8S5{@#4Vd-&zjU4A6As;F{(
zv{pEFn!q~O@;ijzo4(q5NckLCrS0JiNz4vNy2;^$|0YW|vMQQllV;il_MA%+PybzD
zA}@um-GothO6oJoJkNACSo9DQ-bFVvoPg<w%9;o;NB@pygoMg;R<D1_mF9iSNNT#^
z;wRVK8Ab^y|LKp*=4RLBDGnR(q$%f_CHIWkSf!~4q2KrnpC!6%nmszd8m<c6ZD%_C
zAkO!3Rrposa`uX1`p@zr&8YpkWG1u{Lx{4AA7BAvx5F_FwU-YanUct&1~i<)>uu=i
zj;%8J6*<4Z-Zj}>b+hH(;*)pD2s-a;nfZCzz7!tvoCJlru&qb{gyC^LI$-(^fCio`
z<t|YX`Gy09;CH;jzJA}NqNna^h(yKjcj6SjRUDmZ4#zDbu{?3uqf&TSee=wynD1~7
zUR~ZlQ^`x&4jIgdonewEITi#=r?r`II>vJmkX0Y2;O+?Bo}IQwZ8cd~J&<Es6&OPq
zxIEmPVP}A3tS$1G{k|=Oc;<Gw9)~lBi0F?ILqxgO9G3q~AMPCJa>8Yz#Lj_tRLd}3
z!dIhtJiy#KvcVgtI<^W7;Or;~-U-^30h^R=YcBTiSyNyVD+<V*(qL#G+v)Zg_`{_k
zIjT=Jq6nOQD*5Uf?7wC;bfE7`qAbE1bUv3wPU$;8Ol!t-)^ov?WZ6Ab58w)ZVwlp`
zxfh?PSE)b3d{TiFB#p)~%q>~@gKDE?S#WGe!ZR-nj<S09*{YspZkjOWCfn@f{f#Rb
z>bqJJ@AfVXYn21TFRwO<2rcOIqKHh7sN|>YCmgkoMSGThf+R@!CWrub-!+17(`xR`
z7Fh@FpGL`!Zs7T{-O@#fxJAP%S~8ifb6BCvm&v^+jLj2+m&|K2Eb~35Z$r*Y(cP<o
zJib*fB+d2zEREt3(4Uc-T_WPG(&u}_;>x6le<Dbk%in(KLBZHRCUvuTL3<$L!x?`R
zU|R0Q2+^0Kf986wpR2DoQ%Yof4^XNkbeqh~KP$%5Kxk$OYH))Q*+mv)vp&DzYjU9Z
zhTrNKz)4Gk3d{is-nbIa$iqbLoh?#Q4NsNrcGvUTpJ9hdVSqfY-k){!h<<r1^pq(s
z)wCSO0OP=GdsEy1yoE+9&-)7TNp+|3{||F-85CEyc56cj5P}7F2u^|r*Cx1oaCdii
z*8suY9fG?{aCevB?%vS!=|}dicR%~BI)Bbr_5EHg)w9=}YtC_vao>nWH&ay_qqWn;
z9jvH1B>ci1WHHI!yaC1+9QMZ`3_|->yH?BsI$DA8M;>zY)^C=>RbDaPZ723*xw6{B
zA*bS0ot`YimuxG?(GEuMGF))bq=>1>z0z(*dgY%#r4R45o!;A0eSInL$hZKPN9-nA
z;g{sTRCEv~Qxq`KKZ%S=gg4)19>U^3-k9SjSulU!<Zz};2gWG84(OCoAbg^VY5=p0
z+Jc_<sEM&=ZvUQUkDu2aXF>fqhTzLr*VA}UQ81{#xnkq2v(X()2pTh}xkR-fQQhk@
zK0TmaT1_2^rpDk!4ERc32wo6B?XodhmiF0xNFSL0dQ>ImJAGn9{db3=$Hu3&<p9aT
zH=Vz;JkRko<=or=H!s6{58pQ(x7qEB_fh1{(8Iwc{@#@IW3ezClzBI*u`KC0tbaE(
zVv-8u`FDe(H7wU-*{H`{sDHI5GGVJJ`B%pyeHDzzfA>rJ|IvkK8#;?Mq54IY%VS#4
z<);xHmotFGbN3j^R=eHz%&N?SK|T*@mielS!k@FwgYnN%>{ESNy2QEb$ViaoOqDW&
z=^5O*yd><|1wNZV9CW)%#OGf3D-Fr4TEzHm*NL2NW^vy0WPIPU?zjof$7XxI%|qlr
zdvTm|n`}GFbZEKr;mq`V%6ey`x5&U?kLnqLvg&Jn@`b)!Cfj9lr~gr>VZb3G)Yt_i
zeLcgvcTp8N=>dck=}OdQIRTik(K_3n>g(=IW%3yLVtD#^kIlxw?6ZU}Kb>@(e&+JF
z9D8YP9#`&aOfI8!eVuY8H|hDRG;D*C1&$tPGub{l*R-{qK+_^QnmilGs=H$XKMT;_
zukxJzyjfS98`6uW3app}jE({}$Bt>pvA|8HqM<(++4bacOWC-gs+%|*RAOa1Wr3d+
zd3nZUcEgKRVx6nH7@H5x&5Opj!5<td{AEOz%?8O7_-~wUT@@ZS*I6v<`5#)MAY*ak
zTOT3wgwW*c;>t)*;^YO253L&ID5-pTGAZ0=NNHqzG`ajH$2UNnceipOPb3;w)9Tl|
zTU~Yds4|g#CZgILmCVjadfvu7`gk1fkTI~h6kuyQ!7K$FT0ZCbLXIj_{^0W4g_aM5
zems}zau|zyjc!(Aoy+(sypg5j&`}i~d5Qa^l+&ib^X=}Ih68xep>_^^;s&dIsbmjF
z^Nakpy7@)Z-qkh!S@RCJ8MnOPbAT{KWN@ck6{*}r&&P|rqiZ%G5X%S?tHvt#ZJ>5W
ziA7G~qQ7IGh!f;+vuD}t<;K`xPq3^X_BN-gG6*CTb2C%cTvL~_$(U<(2+f+*HN(dd
zg)9=jgm1y=aF0Kc&aewkk-3U5E%A2BXuOY4Am*4kKoM#ANJy|z>zG;sGDN%rXvTbn
zK$}Yie7cXr6s%G{(j(O1{#YrMjO@gziq>pr0+vI5r?Am0ge9If{i_{bP@rRiN^tF7
zF;@w+Xsfx~fi=T}&ftEU*g~41%Ap`Ac`s&+&<nEP%IWLe^!;vgq=ZQo_(^_aOh)NJ
z;SEQXedo0`ii?h6H<8q9h@b<G`0v^&<!`@AZFtt+&I2m8KLvXHM0C!o(lgCZ!f7<h
zaY`@sIs1^QzRG*2&o!O}SW#~ykKwe0|CWfXMfetHi=v87uRl*7n6{bk#LA#$lJ;44
zhVysdQ)((P4-Fe!Be?v%oq-KM<Zh$_hEO`mVb+m)haERrn31^Xt*G{X(kAA)#)>~S
zK<-Bs-6)gKwl5H2=EnxG&6;*Zv+&1MSfVSyMFiSQCwhL$)*M58`YdOad|C*~CF0Gb
zbHA@T+78hY<md1YxZ^StBGk3i9xv(GlF?M+VCSG=OU3*-ELaXDI>xF@TsW~Sz|Rq4
zrj>J@Ww633y<Kb9*c!5e7|02ICuHg7QHtM|?E|ZX%(EGIx<uMazj)Z#AD3QLSk(|{
zGs<Q|H>7{ELXprEzc_sN`#k5Cjl3om9fv1mY9FVDY(UuAn9mg8=uek%J0JWrjARjK
ztYJn)TS)uB`81nH?<+gOf-%~GZN2@2{N0K|f%B?b4U}G35lm?F;}knPMb!Cj@0wR!
z84DFKgCWBKnRWr^N9FY`Y$52uaZ{XQec|MG82!^mN}x@v$D3}!=R|QuVxp+b{5fs>
zUJ07CTi*91vmT@Nid_xLeAcaAgFLh6u8}aa&tf#T1aG;<ruVD$zrB(4y)rJ-Uxd#!
z`s8WWRz&9NPSdc5vcTW9<~VIu5ID0=-v_=B+#RGHSfPZ`RytAQkRieMD{YKZ>JZ|j
zm8zy!AH@!&R%(nvxj7D|@Tnn1fbC9<Kjkj_eRL+3vctFluesE;8nmSvTp;dd;QQjX
zwO5Z@3WW4K3q&6&aUTMNK2Ar0h?F}t<utnx3)~BKtBjl#DST2bmekpTbdcFYVIr|^
z<=h5x!(Zz0Ui!gECyq1AaNnty!TKED3#0`iTvjyFBJ}Pz2ZmaBJNKDcPiH;fHMzu1
zL48NNZmTIm!cu-I2y+t{6+SCj!5CG^%+=TG?{T)=4Yn3He5kw>p`Y+DPF2%rs<^~r
z#)D_EB3=jkF*jVJYS*b1tXIZrJ;i1fd;3GPjD_MX_YLEGhU9>QsZ6?%mEDzI*hs{n
z(<{o~2mW}?fZa&+4T=Z8oTAT4&^mJ5KzEUIDMl^6o6S+TvTIOnOy$;#4{n<H;4Skr
zy#|x77&aRe%E|_xaKAe3_3j%NUd$Y&8=*>;ADXJ7cf%*QLtmEj;&Zx!XCMK#)#j*T
zHuT1Y_xCSX6vx^0)HXmNkLFOO7>a5`Z{mpZVs~dbzOxgK+PxYkAU|(kd2#EIdUlu=
zS|EQ@iZP;~esdzIO>Q=>f%sP(McD>l=<<(1d_>czA1}Lebq_Ch_)nA81$GQeQDU5z
z5^prKX$3wxPOGa6&Myl$8p$3vKWox>F9zLbH`RFS)7($s-)lKYbFnAmmp1TsM`xR$
zow?`{Ac#Eu#wTvM&Q?pZEmDL3owI6PcqWO&Xf;fZuXnvZhbGTRpJ(SEwC)zPsk{I=
z5W*xBaz_?t9qw(Hsx<=QgVE(SMqu{F2l-R{dQQBuRK}z%DWt`uI3a1Ca}D~UW62eR
z#p)diEQjRO<ED-Lt=Fhs59*KCA_!(>t%roeAHAo7%q#I3QmY9R?RCEBKlh+-GdXVC
zk<XMxwdY**jsJCGc87CggtU_?1IR&^|2X+29vbib!0$3r^0Ux3wBDpKb)cuj=9QSs
z^qukBf`b}^eCSzQ?x77G;nufWoL>qVs4nruG?36X)SRfQ4V8!^(8%F^SH(wXgdGsR
zpuE)1D4J4u^w(MG(rpns3A|g1VuP$c&|jN47M~0WAAR+$-1gO-9EqG%h_?ldDZ12>
z1zcsg8nOwNu5Og`)>ZIP5~tqyjr!tyB}F1CIpc}JAwiDaiDraLA(Xoy`qPKmDl=$0
z6->aVU30AZj=IoVmYP$D&S23>^Q_G$vrmy!YWhRf_wENAlIs=DvPmN#U+|pW(O{EV
zdbq#+meFGOp46&V;))VcBr|Dn;oE0SwNlUgpp|>1$+s&w>VzR<^7r0GWWG_@PkQk5
z*z`}~J6(*-2P7mpcFzuZlsJ5YQf?<g7)Np*3?J^aA>$_U69q3%ADli1Vk1}&M9=g&
z89uRqPbSA0iq&#bNmSs8`xSi<FV8)$oSdM*?Ml8{C&Y*G>k70`f0TB>vJp6r3Pt`O
zk^xO|IO9Pfij^iuonh`<wMS_M(NU}^i=O3W|4$6z4O>&}3)o{Bk2`a~j7C&sf%L8j
zF@^-@%Rcr=;W??)H2>FMR2pz`Ou~LgS~w&126K=>L&Q9p{h|b(kQf`5ySi4yx`rk%
z^gOZMyEwxiJCeJa`SrPZrMCx=;7DdtSM2?0m^{U&BOv*fQ-<vn7;j3LHl+%G*yc3=
z-o{%v2!gFdY$NlRnWPgtYP8#~5K?BPt_J0jcI}KT&7!`p7&7f7N%vxZlpP#QhurBM
z{ssfKJfRa7zSfK-P?winR1`U3bJu1HaQ0~EkbUIRx7qLN*+TcIL<!kV)u+r@NG?+j
zZ2oJ)2yItONP`6e-eP|4`v;47@;KFyxt)=0vGnZ-Z!&+?((R;P_-es)I<&1CxtSL+
z`D0{{&nqr`+Ao?7ogn@-5xFzcm9~!6A_Chk*iNCyG<n6WHPI1CM45}Ex(heXNdOwh
zFNoXFh_9%yvH%gZzb6cHKY=1$V{SF$7NMA$5!Klk*C>bJqYxOxt+R8)luxGM%^33N
zHu=r2@G~I;l0N*dBE`*~Hf#H4|DFz__xZ`Kx_@vgE~YVS3KFId*iggxq0YQ#u8szW
zIyd{|>G}GJ0q9Ysq%m{+r1y50S?HE+Q64zWCLmZ>pMrJIAoq&x5un5DOZ?uWm{==b
zIHD~C=3>eHRxFj`nJ9|l+>^<Crccj{Z2l12ZA39PCG31%ltI%}G3oUNZkBaToE}eO
z(sg@#UmtMMeZ)TN(i73r76KBTT+ATPA=qEQ+H65MJXmy`-W6Q570faba9kMbkELjK
zBC@W91E!m?X>L90UvN(hN2N2b*|F&7Bw*$ai4kT^0$laLYtAPv+UiI^qe(1YcPqN-
zdrPF>`Bp{ch`#oi&>wSO{W-Arn1w@ms*wXbIEFYcb(;@o4a_+`QX<OLalfF;!({5Z
zYHyA2)Sr!uG))J6u_kYAfyzBh;`O_ZLSbm@Jvt3a=vB@$_M?kC6t`8yr$O#)9<SH-
zOUQPM5uq8etM=XP(wz<{&Me_wRaVd!4zlD`wN4F{JkrxUbq@axOtR^^gBoHrDk$gm
zHiigOApGlOw<-$SUS7m;gqY{hf9hSAfNluG3*UdBuU~EG%f9G#@kNQxKK)ik#*CM4
z);uSrTLshB%YFYin6bpUEeO>c79H2je0td~byStr4qRzdSj!Gxvn~R?IgE^xAMyLZ
z7GpASnKvE8#}g|6O1f||_a5+MdV*;3g7rppHU+>bW?+(0Tw(#7tK9F$v~2N(-f7-4
zaXCc8O)d;Gw)*S~(RglqgYzo>Rof<QIP|z~^3uEZS*xCErym0o?nb*J1{`o<cO@UA
zS8cZ$LHG{IZicM9A!flG6whsDXNz&*AFit%rAGaWN{cqesodQ`N@(HSy*I^L`68AN
z--Uqox>taS60bq9^ogTncHv!Lk+g_|0Y&la0wQ<arv2W+j)l)3Qx|)-@2J@g(PZsz
zdBWJF?4^-!{PqqovX}xjM6OK3fX8*^+ji8kGKy@8<ZUxmOuhXXjC*6qHOA~RRje5x
z+Zx6ENpCZjyQU+GI$i7qmDjt5*MeB`!j0eVZ*dI@eIzVZW|pxjUA|~@#rO5a?vla6
z#?yv1(km33O^5VzZ4!mN=Jc@X=#I%tdzBYN(QLsQ5BL~y9=$fx=1lDQ%zWRD+ER{$
z$>=QqY`eG7B20mYI#^p=il}6S<2VxuV#N2G#uIm1!EZc}JzqjxigiEHy*ClS7S=hS
zl7C!d2XbqcdR}~Js2N?ZG0|bdgS4AMOeWafG=ww8qp@ZA*yc?4j?#GELxu77VUOvy
zl?k80dDh$+ExStO*_@9)xi!`o|Ce_;Wi?D(5?1>ADY?%RSE_ou;i&~)kHlCPDczK4
z;9Unr!4y{Wx1FIes2yGlBE4z{$AIXfxPffS6wYqn);Uz1D~H<D#u$9gM-PmUaS`Dg
z=f!!afU$gnT_3|4IsBLQ9rp(v`K~)NoX1l5#RjP7r|Hd+9+-o$3D1ldL9F#e&(h3u
zZ$=TamPmpUzD=R*Y1!*z2`T5M;8sAwth+8s`3*~0r*aRNi{Q5#hu8z6vYg5aLvUOi
zh%%%H>L?BW;)J=RX!9aTd?vCnm&0}1?U=@sUs`Wgg@`4*>TnJ|%%?nXLLc6G9e*Yp
z36^S8KQjZ&GZzZ{ken#mm+R>C8VEBRp!|McwoZr%)VT^PuDecvwP8Ordl3CzUessD
zH{=e_Z}jL*OVh^7ciP!v>M(oSLl+#Qu}7<e*^bTAG<Om%cB*D3B7m?u7zUGJl3Z+I
zq;u1gae!W&Tu4PCk}Lk*&PO0G{dE=7Jq$j<r1s7XNjK{%q&~}WTS8OSa?_ziE7Tah
z@_cG_3$)g)Q;OX{-v@e_=R_5J2I-q}5oj8xa5K-z>8UdW-1M4M7j$uTsOxH`{o98b
zzBfyrW{?(c2hdEqOr-P^R8j;8p!M7DZNTy=(>#tvr3GTy`+7z{$v$fH8n(sReGnGs
zf#3Xrlx+lwS<0q$!BrWxuQ#Mj?-RF?FKQ8V*n+YAquZMa+I6H45rPi-Bw0O%k}kCS
zU!idLqQ<p4AAFq9{6`gu$w-@4JMIXyHmJ@q{G_9NnCs^n^zQFD8eeliWm?@D(9djY
znAMd{i%FO{GdqD@5!WRv4kUe*-;~B`&-pCv$V9@d`o#=``%P919o;;<Ni|;!{ft{b
zZh+z>#$uOSnDJ5hPqvc;?04m=?Q`TqW=uOni0sm$>hGfWr~RKIU_G1D5(ZV`%8v_g
zl+Go!VFwAs4$_VY?+dzaZU>i~!uzBwoDj9^*<9UcEH1XLO|!}aSY!jGWk$%%0PB+3
ztBzTn<wDTUi9C#tuYM(OZ?~xM@#(X8qtuw?fRqvk#oAiR`~o&*_=U5AlcqZt?9tEL
zLDOP_ZM)Dm8T^tMs6Y6g8V9Pw@5`Bu@F2aUd6+<JSe)tq@&cRzZ<pg>OQ`)`QMj=X
z)^N6;D*w`SISH6=zTU9adsKjFE1+lhXg9RKo^nKw29&P)3uX$O#~_<k@M#gJ>U#xK
z$D$eL?ZJ@qT}d1YQCUBzXG3rI7#DU-i%*`tCMXJj4uREHo^#5VdC)RLz&#<=r!ihU
zd>z|sxmhqtTe%Q64_8ew`{Hp<$6b}i6!j`;1Q~n<s<bq|C&z!O(Vm;yW*5D@+vJ(|
z)6Fi^jsq+xN94FTb+4mCwlBV=KqqFs8V`Pw)8BLM@W5xbuTixAD?9Z&DpO|+OZ<Ie
zJg7_ef(m2KdarF*cGIS(-Mi(TkZtqwkw9{$zbsb7ndC+Dqe5&J-Sq2$=t?NYkPfaF
zKh<}Mrd8KR3bd+41D;j!HjWT$)s6#_MP0ZwRP$}Nl&FJ4JZRj!o9jExL6pYq?EX#y
zSBrgf1x?GP2aTwMvgpsT+k%s$5t-Ieb2`c5>V#ZZg_U|D#?@QYfe9VYR6Jq={Hju~
zfP>sbzu?qe2AcfaCeLuS9|D~^fxO+51aj02Eb(E#j8Bg<#GHO69hTV7Q%91Y_?Xoc
z#PP1rGYZ7E$H~C3C3H0;@5BXR7EyMYaeYFI?{u@pCP>oFzwOpZ!_*hHGz>>c<;H7z
zNawtJMTq9q=G1<#n1q!>^_>eKnX_ev(Ds@kzSV_aB$Lcb2&p10d}gY0x)qLDIeN|8
zh*m<kGwUX-FrnGjHbt*J8WI{Xw7ToP|11?7WFx{%TNdqL=SuFSD_HX}9a*Fl<07P=
z>I2}C-8wQNg%Re7j2>-o5SIy;cQd|Jz0rq~0}|XUukzE1llRnpm-$gv%IOKs1w@a;
z^)Yx%DBdLJG8ZZsJ5E4Fl-$^Jvw)d76wElkozF0kNhVMDgkeNj!cr&k^nma4bOJZ`
zU>ui=xbwx&GAyf(`V$Xi`ww$Iu3w=eNV*=}`aX`#{ZHY$)3)ls9#MuWeSsnm>Xud|
z&Y?ME^F>7v4{Ow0A$EhdA9^NdCZ{I5;NFX!AWqEoLF!PYk`hvW;aSIA;1wjF7k|C<
zgCs}y>HXswD%At!7q%z4J%w)y6P18KUJ63(bIJFEe97dx3I1DSkcZ?w^E7jf*|%6j
z)#-d3v)_}&=4xe>mTQ|KGy;i|qH5&UrToM<@8$Y_ctw{jN|i*WiM<4*d<0s465#Vc
ztD?dLzPacV5^S=de@eWlv32E>rI;(FUH);F`U2cz)b`o-7Jb6&6xCc;h(_|CJts35
z-Z2W2qA|hT$t~c;{T1u#F}$SAbY~ON1-!*U<!Cc|%t>vAmFQbvSeHleE(@t0MPJUJ
z#DT*XlK34<Nl!?#Ji0ujHbWvI%=1*7W^BKUW2^`)Mt8|V62dDNIrC<noxNZv&TG%m
zF4p>(9_ll%G5O3INHmwU$W-k_z>_4~>ENANhZV|cQ?pITfx<O7__OQFH4a#}h~HR$
z?8sIReBpUi=WjBs5pjq9+gA&(;zZ!ge#PDr?u_A(t3Sboxa92X068A(YiSTpI@z>w
zQEv9;7NEUIKK8Xb1ZL0n+M<tlE>c-Ibj@ON+8mY1^g#j+iC{2m1>rME{9pmpQT8X1
zGS@YCL&k5A84y3P0!|fKdt5?fB2hG9D1MxkL^<7v)Z#)t>vcDD#&#kaH+Wcc^1bR`
z0<?pPo_DjReSMj2%3a&4tTzB+fI2lgNJyqPFt=B=-?#`EC3#xTuJrhA+$*R`%XBZD
z^+k_d4se)yyTUksX~AXn_;hU#@T7mYDVRicPjuLSsRjLsrK^uiNQgCIfyoro2SE3!
zB}kJO&}=W<l7vIt_Ustg$r4%$^nsR(@l{w+{xwl%*ZATzobZqn#Pi$I`k>e9%C!zQ
zQ4-dxhwovHr7`RB(g#ve90E_e4*IMO?w{$W7S(!Ql@{fT7)|5Vrp7ERyvZiwjZb-s
zA=_kO+z@AXgRkUXkz8i90O9#Pc}0~<kOviaAd%AuIt_MwS(V&_`8^T;VAPV;LPhH{
zB7SRn$tzv#e7ScA1!(<Ax5mf&T#yH+)3Gn}51g{qwr2rv8ro1`*~QldB!3scq#6Df
zk+I`Bm!SaJ6#oHZn)CaF=>JAGAIB^V6d*7Bv#WIPPb%zx^PB%?7kW7)CsX5D)z4i%
z2Q3Rj*t@}!u#zL{#I$d^#nO`p-+o(ak!=*|$|Npx8_2pz@nL}Yo(%<-H{uVD0v@|>
z(WEMx{|i*Ou8&i|j=OK+GIlBo6>kEsYbJ|D@wsj1gP%a?E-Gm@jL?-RlD<#u$bpwK
zRR^ov>`I^&owCZeSL+DIq~Gz*^R^#6ms{F`YHy<L38?PaT=QPwS~;&KNL^knP#Iq>
z7(G!q^lN>?ccNHY)^F$9ha)eX*6>43^18zL0ITCY3s-wvzfXN%l1%T}rs02|=UP0Z
zEBoR<$~Phwe*%GYbYA9(bzZCy;4Ry#$8!KFXs)Q=s$P<Cblu}sb$#QttJ;UNu2nn+
zT$4dEr|z$it}9dA_FeYw=79E4KIGY3jGlyP&iIo9snidVb(Yf2J8+t%)~Y#5GB+Kz
zqr6r*;e1`Zg3r6`6#JPhWY}yh)0-mezOF3xKjStF`@BY*J<XqT0*Si<4Jh!+5ox4R
z5bZwT@X?RgtDDqLu7BaG(5WjB8EJrH$1={({OVzyInJsB{ESV!x$A*_p4NqD9{Xka
z^~EG2UdYy)-&&VNh@4Lu^vujq=CE`%O7wEBny#Mic3@!d(3T3$|7JBJ*9?lKR?KT_
z*jeCYU-~*~noDCkm6WL}TYDjwDa^Pzbm7wwxXG6sI^gu(d%kE@(+XAyTVerd-F`W1
z-G1xEy!`8~-SJoje#P~*iZdSF4nIl>o0x??@C)!5*l^L<sX{(cpqzK%0#T=!FEY@M
zY^e1Xm^kb&GgZa{(Vu<5%&moT3MFrCbauS<RtZnr2mE-FUXX)#8Gb;)7lq(QXLN+z
z6d^tVOq5=XJ|Ej>=2d_w3fZ*Ren<^L&N<cm@Mj1Ag=!R<xoJS!h-j&?w;&Myz79^S
z7tD0B=F!V3w~6bUV1k>eNmwoHv@#3MdF#;h9ANP`USs(fOE_%VcAbIl5z|jvK?lcE
zRhnVoWe=qHEXbiTGkVYxRJrQ4A=~ieft*Ewe`xoF*?t{$1lWLySt*LmXSMGML^j>K
zNk5DrMc6XOyQlG2LWxrD6YV6E2cx~<3#;(L#}k^Dn1k56puTCto}r%oJ~Bek!k$oB
zl=syCK%L`rIQTN;kwKpPlM_IjtLY3lVK<rb_FCirFVVb!<BYPNQCt`}C!KxAJ@Lrf
zSScZVbge|3zoxQMW?6F59u0DYryDZ))k?Qb0bleGBfr7^&LJg#T>n?u+XSd)vnp;@
z$_glsN>&Bn)}WIHAx(U;=L}R<y(+t%{Rgd}`1V>qtjGbGIVeq2KCHngrx-t8Eva*`
znrxEBwbrsbYOP^~RwsAZGIq3nCWkC&{$!V1Ln%zZr#T+f5j#^R-}u4`@0_y}Q*_o9
zQw<hk@vfEB>A)N%7lAc{4v|p?4N9zd{N&94vQ*6spGnsIvay}x?Zt=cq@b-nxjd=$
z5s}*4nUvP)(I&>mwrwP$&+7>v281%#(r-2N{jIMvKzrp)U(hBeOa?ajH{Y^iYk5ID
zHq@ts4*;(6&SPVKKMcnk=KBn`X%4zv&))RVuK>`n^vcQ7uy#tioTBr9aQ1AlQl)-B
z@XZXhwnT$2x6JZY%n`A@Kbil$96hyhWpGj1f6$e@KF7gxJy$yWIGAJ*H+FHz3%DWU
zG5S}#2WhyjmAD|Z-5y0-^k+AZXY{$9<Bd7_U$I3tRyOh$Up}eD2j4rA7Y?>nqoyo@
zfzB$nn$i4k?H=So3%i|E0#C5hF{_gtQ@C;Ch))`}&nT1(Y{PZa%D*a61&0;yr(9%2
z&Ik7TTo;uTR)hQZ-UKJT<n0v*KQd^pW09N{-SJ6UySKN$A92YRZZKyVJ%`23I}$*^
zA@N8;4p#v?<7N5T42h2ld=7b6oyoDSRUq<|n!V2!9C<%maXF>*bIKjZ*0F(3GhX7P
zlWO9?Tq4<39@r*26}p4(#5@In++!82OWfXxUw_{-`_rMX_cE0|SHXPl?p4nZd{)`}
z&Fd9cr<&|Gy2}o$F4_rf_$dyq;UBPV%ZQG=|K(X4wK{|nXi9ioPOA@|+G($s+-sw!
z>2~i7z#(XOF_0QiOJB7e-PvQ)fbej6_FE|YQQf1g^ns8(O+*BxJ)MyQ>>I((_mi3V
z9UDFid!Al599h!l@La+fR~-Crkfwd6C0;CSz5aoaFlz-loTV@EG`CoYii>R@lt$^%
z(yP2fK#L83tQ~_<w=F^n4DSX93L;5$d)jYSW$7#e3MaNpsXbcUJ|~?`8oA?VBjC0}
zhNHi|UbNS#J_yECdbd)h+vaEPOZ?kLepNrk{k*!`pt8Dft6@UxAQg?{@4{7XNoSFi
z8G$J8pgVE?fny5$nQ$IoxM2|AyKw`MG<Q}{f3hpKcZAU^piI0D>Ksp)=1-#ZD{D(=
zp5nXkmI|JKQ35S^vaWT-t{vIzywS2KSO7xt_@s%{ulie4VA{dDN;f4o(lWPKQ^uHS
zrIevOtsQj6;4}4CZ52|_>5=pjqlf%YpwY~&*Bse3#9jx&!*U>GosY^I3NCH^P7VX-
z!kaD0iemK(edLnuR_95MZj?jmCyd<`UlKbD^+x4?xe4o}ks&yW2V<$`6Ka20Y+#pl
zfw4JRi>i0xAZM-ooVmM|cA=nuVk?JaiS^{~qyGPp{CBvm5J1Olyk96Q7=Tu>n6~Sp
z^&N?zD)-qB=KFnHq6Oaw+`p24^+kM^Bw*r8c0G}m@I@>Pm04@DCy=wGu&Tg1ecF+j
z18APB^Q!%%@lhPYP7H$V8}2h9u0oZQCY|DiX;%B4n@0r0^h8u_`Iljx5y6GaUWdtq
zIiw{z6ej|XI_|aJa1IoOhea!=(uJd&lbF@9Z~*bI1g$YCaGt0QNiP**zYXtp8(-0u
zx$5zoMqccY8%5-{&*?`JljIYsns)FCIFQ=lTml&)sinRL6#KiyZtkV;pZsEHI_{Ke
z7kA2a*8El_b%%DWe0?Y@@5x{Gn>kS513(gQzcwvuT5B0h0Qd#A?)QYH7(AkrFBd|o
zX0qotB=vj|;WwUW(~8-z>)A06{IFDmA}7keScpL$++OIC#i_?}UnCD~j654n4cP5i
zlITxZ!jgF7gT3d8+Fy*|fT-zac;dYupo<)}X=Wn^>CL+TpaieFxCB}5*y~#GnTesC
z@_56H3zQ1&?UDB^_>G>_4q2_?hQU>x19DFyO>&Ddfi2JYXo$!@RWv+|&U&k@sYHl;
zSG8RWU(W}|5BCE)8Upwlv=0}l9_dy$`HtGw-O7Zd{!KIz4{Unp-#TDiZR(Cfx#E5r
zR<W2=XQm6r&KDl}rO*1dqon3~`5S_DtH<_W87J!|IK>9BVDL>gPfEO&^j4mHHDu*k
zQ8>o2HA$TlOr88P^=L~H8z0AHmDj@=M8&h9i9j;T{nNNYG|+MSgV=<+S`x{a-+|Z<
zW7Y=Ubp8eC%SDC**<>aJ&_BSZ5#jv}md}ddc0EQ%R+kPrxFrfx*3+j7O?OAr&;3pO
zN{=}0GblR7?-}3Zg!gvmWSt?`v)wP@on4y6(p|BK&tto*7s31mpwZ-utQ^2MUBkv@
z2My(s(C}sMZX8>3lPwo?U~Q#~8_`i0!k0{l5$y|UUAF0Cpm1X>Vlh1B@rH`QhWP#l
zC8K-qM*H?or?|dg{-y)%#~>1Peh*eSKYdb2erRcPS}-Yj4~v>q>{znv^ra6@vM2p*
zOVz9??Eaft_dalUU7TtenS$`R5CQB5$=h~$8zdfHj6_70v30gn4v&dlLg;m-EJTgj
z|1_3rbX%ibr=UOZrR?IZ!JM)Hj-W9^X}QFyLP8b!RQJ7O&4<KQWjdyhAtVJ_(I-x_
zek|VC_>uML_#&@CSL#)0V`n{GQpJnU`Aj9LR0ta34wW^F*7zDX?-$Wh2q<CjZEi{a
z&-%>L;u{Rabr^<2l7J8Nc##C9{7&CHDN~IoOQ|mo1A|1f#p~aWMTzh&ro*}ZZKyp!
z#-!%38L;~`zO-ZUZzPpg|AmXd+nUlkd;91%(ZcETWaD9p-RU|?hHGB5=o2^p>?VaO
zgS0RSbPkzKXP<5eQC4djRhQ45>-0($sc-m0Sv?sN<<xl--<MqN;r#vFdR;@Pu$ylW
zNqRH;)Ku?2n5H~Rb%m@^ZagBzkZM53XBR!FGujA1>hmwhoBjcP^4%{NvxNn!&z!vo
zAVdB?Nl66JKqXU5e{Pdpf|xLw>%)`}^sR_Q)}X849B&%luSijh`%`78ec@7Gd(;vf
z^B>tK`z-FnF$D`ZvURY(X1?O{9K^<Q_$2LY*Zxh1vK$ua-J$jfh_qa&^U|r5sV<d#
zU}o=kHfrBHoj!#d(I*Uy6xVG=-<&m-!Zw?S4w0ThkFbBR_8_agj$sBQ{US7k_im*;
zG%|^PE+4h)Mn*H27<-y@ygl5lpOi5OtW6F{O~7Hu;3$5Ipd-BVvgP_zzs3Q6XT8pF
zDhoFL-Q+F&#)cNzwrkj(@D@2VPgg_0zNmQpzTR0ca*|w@st2TD#YQq4pDeUi|716%
z@=a8`36k;h)P`2~3jgAe4l@8$<G?q|$d0DM2+aEq7w_MD;VWH!>cdWNXaDqT)fRE2
zUm~lRH(<}w%kkmyLP$@LC&k6fiKqE1&Gi+igACm{=s>PTG;(2M$e0!bo9%UVactn|
z=)hKKbo_!pZp2i36>xHWn~2Cas63V(8nE$u1l;)7{Booo?q)&0k`~FEU8wdRp)At;
zCvP|zemuqX+z4I!1{aacT>@&l{)9o$H2{|iIi^>b-t`LTx;znw{WmYsK-*=ezG<37
zcX^@1nr#uaKy8;Wd=+J<%Nn*XP8D8*M&Ho5{YGqwZi6ffsvzXGi#}{09A;FW)^17G
z;1Kv<oXRFs+Z4;q5*2@Wpk4F&qx(ONl|(BBddcbaUYVM)hIA?WP+wr?@U>RTygWSE
zcyF4n1JH$6UIv0L5?Z0w7zEN>&}eiI^L65=yv?<UPtRD6<ciF9Y!ID&X^L+;2~`eE
zKX{i0fE^cfD?8s7BNmVG=$RIx*&h;Sf`50{e@qNWm2S6LRC|J5j$sVj5_RP8?t{I1
z>8k%2FO#lF&2NF}v@BcuF$zhYl73&B?0ewhP5sbOkF9p}E0cs98D1<{;_Uf-9toWw
z-l@@hKk@DJ;3Qg;shITdI&At5{lA~#!Xduy0jg5|QK-R$j)4_;KRWbZt=2!oj`L0*
zs8cbjuC!WQ(#`p7`;WMX&lhC_R{iS$2}yE}VVX>PKjMSpYCXt5?EqTIjWL(TR`!fy
zjd%%1vx*p`W3|U57^(2zbaB2G5(BOizmV*#su20?*o<8CNVD*dQ0%_$(?D@(H?O&^
z9t-muT8i$}L#K4E%k6emy;EDP<zdR1wA*;HejxIgIUkJ=6(2gg|LC(N^0zIz5qgq(
z!hcwGoiL?Jtva+^!rh9prA&t#;H=-NC#aDjlf;e=Y&*icw|^8cvbc@REv$sCYuZ}^
zhEh4B<kIYOGlAFNHy^UN=;2@49AEUXLQS_Ky(f|Db}vl#&xywK;<n1ZpK@hEvNj<O
zPR&hW(1l6uBTUUC=-E{*qB2IS&$cO?ME;#~QFdSv^e0u}=0r~vut*3mGC`E>c5g^5
z8jXFC2gvL^d?Vgr=K-8;=UqcPD9zJkVter4xb!g~7TLhRcYf0XV9#l|w{sVSdMjRD
zRWIhE@T?!h5kj~GM}@-rUYl%X!;#g=lfMZs-JIG#K_Cz)eR3!jM@y0M>;4F*;Oj2O
zR4335134<o!=x|5{bS1gVPI7&Dw(y9A(>ulLm#a)D;}R~OopmViq{MNqVx&n;VVT7
zJHJ&P$O1_{M0josMcTKGe~;<C_L~*C^rznu)Rmbq*uV^nTW&Le_3dpYxe32nXNZdL
z2~hw62pg#7H!3-s%CJvy92O6aR*H-5q@XYx#m=WcL$Rb;^#`zlvzu}U8d|jqZ^J}c
zeSKi->5n2~i~PPNI&$n$?fIOaHQ<3?DIKLA%z!52N<N>t;Q}z|H=)*BhO^=LJQGj3
z0$vk;wtKUTh)~I3oJ&zWVD}U+{srBXyC!amFV26?E1_&!7lhBM0Mf1akt%a-M%e<-
z?7Kf)3{qr)m|}n;ao{iSlF0l6s13}Yt30F`e$U2m0pj{X4w|lJCL6D2ZYFC<m%pyI
zWZ~^w#(f+$69f$v#W<&YZMzl9ofNjMEdI%tak|S?WQq76NX`t|D`5JDM(Bt|_ZJH1
zxaf*U2Q*}XBuMSy)o;C0U`PEHme_f<XV7Fo3}rX%Sd=>YuyN5b3Sj8)F|ah*xjp-i
z^ZIb5y7+HQS6mHba5lWL-{s-Cc)7=Lg~Vg{qr@=x+8f#s*Jm26QvS=kUOk;m+M#QQ
zJneW-VJvp`+o8LI>*u`GSj-5;wjEv@_W>GFIVJZoJ6r}<|IZTck)HLaOK*h!fKvoU
z@K;>C7^A7jVsss=2;t<^5Fa0SsCyrb;tfC#8f4*-`Z`kkW=_1Mn9rp8NW9m^Vqbg@
zA*0!D#6bNDgFc9JXvD0*>LQ($YX9=|*!&t&?y@COZ23VF=K3NOJu_i2#EbsZe{d3;
zGcu(*E0DwIk<n%65!hPyL)D7Ab~y~ci|b~QPMZIWAMmNJhE`3hxF(Pb{jP5dQ@iHG
z!yt}Dyvbe|HnK>n&>fC)jk!Li{g_NK7q7b=J)-wZ4(^lZgN#eGpI2qli|>RG0h5=B
znwYMlmIl?akl50l2i>w9cI=DsHEsWguHX31Bq>80L_GSNFU8j-Wi+OX*<Xp#qZwQ&
zgCVXUKUy&Vi4Jy%M_T<;RJefAKlJ^IG>^gI{W=})7h-k>mZevUFduSWG@)y*G9AWT
z1`FaVM1!KvZlc8gkcPm7Zwb}2^rWhNanF$4gGvQu*EQe$S><4O5VtcW!9a)M$0k>W
zyHLlJXCm=PB;HICjwFV^fkSJlxhh4KkrJ4B<l*d&7p{2NXf{n4^KElFLEHx?T^4Hq
z%7?r{yBmkHn2y2nwh%kA+?TKTmhqI$+$~-ZRmSE=#17@DljM-le(Z}dG%fZM<Lc3h
zjMECt*s^yV^e@p#L8cOBs9hi=pwqo%=!?-@!AiI^x;)(-HE#0UW%T`wadEf0L0nd3
zoK)u}9<dWg^d*pXISYs5DwLAH?Du@x$<o@*5KsbIeGBS!q}!KIx-_|fSEVUQUd0n&
zY3uS^T7%_$qBMws5Bf7;;Pw$F{@!W&<|_h^oM#u^Y&Edujp7Th4ng(zcB;k6T5|)U
zH}#_kpfv39Uhy)3RSxy1LCq$eTF%<@P;9Bs!yEi`@|t@3R4s@jG@vsbGHi+;Rf``l
zOf`<jtQ226LUdnSq3^s?jShR8k$eU?5JakPg3JfSuv&v%WI^M>q~`UPRgR6@9|q}F
zGXBDJ-}^r!<3CGSOq$k_F6&y2TV<e>t;=oOl5}*nc5*}_k5(yo=rPJfFy%GW2!}p4
zySY+B_AJo<R}uo0YX3<M(ek~@qrj9!rM!&1Bnp*#!GnEjUo5|aMaGXdTro{1{tZfS
zA4ld-jVnzz;hC*8l7o8xg*Om-+A}|o@%wUU%9kp4<E%MO@)@2WpkA0}0a^~St{{bj
z?&+<OA;_Tj=Yn^dcj%-&IeWTud0&9&2Phyu?UL^?{>bf~6|j-P3~Fu}*GSB)o+1{}
z>&|xO{U+SDY}B8uA>eWxeT-4({pR~WVj}Nw{z{2#rHhQo2jiiW{(p-k0{$25VaI{&
zBCl=Q>)td=`X>)ziVPPSM}ipi^-T*Apdjha_D4zN%=5O45=P^&dlu2Le0N-%z28-m
zocdYI3IoZbi@nxGz=jTG?I)n@5$k;2K@+4Vss}7`5|l3MeD%$p#~9SV=mc~pN#)mh
zZ|+NG*gt=GkY8WkKKt<B%KSLx;d$X+Ac}`54x1x}`*!I!qjgVJrMsxPn;JByag>CA
zc>z2fm>I$K9|DhMiMhTf?8>jCnKs8^zWwtf2gF@@pz9Vk2|iW4Hy)L9l;6oIzM|MG
zmRVZ=XpRK=!zG=3|7pbkLr7ck1w2GK&02TJX4A!PM{ylu>RAFL%w=#mcG0g}5HKP6
zR)X#BKSav9MY|0rQt+TvM7FdDpU>C5e})0@TOi4oi?1#}6Gv6ynOA$6f6`KLEJ)vJ
zvel#VFrCqEB{0MgJ)u~DmjquX7YyZ{AW4AOCF$@;A|bPzNHw(iJr;oB8lQ593#R!x
zy9yl?M4s#>W#F%mLYCN&chod5ugtBy;l7_Fs)o?)liINOcWMl>k+Eff`B`?F-0x|j
zFOFf^ik~`9^d}OjMl2_$4IQ_fg7d)k%)vk=rupiVK>6APzItt?*fx8vtMxnOITwBV
z7J=M9z&tPuf8=#ll4P&lg00#)F>+>|3aRTeg-EZXs|g*pSDm+0iYg<;RV1<m4OC@@
z(V_yZoULwvM{E*TaoeGg%xz|v%wK=)1#V4A3(S8-Q*BtNQ8>sdR)!Gq5rCRYsw;j+
zExecl|JOk9hsmo^fP~(OBP@+E12?Ty33vKX(%3X?FvJK<X<cKvbXD-aWfU^V7t?lN
zQHQ^5z($dhy)Qlp&l4#_+Uea9QE<{-M;|>s(T>o6AE%_${6`A7eGdSD8ro(u3aPyy
z$9S@=AA;vGt1%mvBg8TJUqm{iNmsAr$V8y4tbwdgL~?T34^-Sw>zX85d{Dv`?)5Z*
z|05dxKqruq%w280J{BFE#d}ue{G-jDrL{%Rwn9(N8VA@g`bqXIPfOvCqz0~4s{MBw
zC6+1QCE-y4^pJN<nJ0p&DyQmi$G@TR%|tB2PUnfd|C3verKAY1LqpQ4xzej){DcC6
zc%=S?kt%gq(If-U*eHLh1jz68WUUD8Nul(G#U$2nU7%wB>AgH?J3Q%-HJ2zq2qOO&
zgZHLA62B?6@m~rFzeBX2%~HBRA%5nF()_f-)>0c0C-jpOUi=>th<oQE2`3;wk7Pvn
z<{5kCOk;b}Rj&;IzaB(`C_-dhcqYx4nXPt6Z1)xX;RC|!Na1FmE|6`P;i`xd;!lKa
z4k(59dxr%Ecs>wg8>ce)yb$oPO85E^I=LCw>cdiE!(w1c(TO|AIPnElv>sQctqxm_
zTX2l8*SoUS&G7D;*Qu=Wc11i1Y15Z>cfX}1>q)U<62<+2H<fA+m$ga4^6@uP&a!_i
zU~v!YhVcA=AMgOmP|TqJ{X3L<gm$Dw3q$xDOf-VX!@F%y?nV|I90e`wf&23|BdEFb
zo+=ba&!;)GYo?F^LSw0D|Db1!;m_xiR<+rVg>U++=u%G5=oKMU8A6}o!bs;__X#!H
zJeov{NR9W^$TatAN1LdRiNFf$EU~_~v+=&X@ljRA@l2Lv&;kNr`!U&T$A;o~oay{x
zrXthw{))}b2Q%!0llj<N8Zz<Yc!`11FJ<q?ZV51E9CnU**=;x_1g-;-<dB7%EMf7i
zy{&SI&8lH)U8o?<pn9TRz*IA3&vD#jslqY5DxC`F?3n5?oF)_B&6yQoN-SP?;5PH$
zhhM}%ddXsfV_>aga0gsk@Wnhkkd~`#=(PTtuDY!gFKN%Hlg2i~@1o}PxMM|fZ)jDE
zBJjsv_4^(Ze_VY{y^-vKjj!n#4uS=7!I+R>P5%;&#Z;5Ma(@>!qc_4O**RU4IIcE(
zeeNF`F~tAFlLDvX=0nkO>pcMDlyrw3l_i<6p^0=##Y1B^5^4NLr`D?(<MOSsU3s#!
z)qxae{;8DW_xFY40?V+T&sVpLAIhfUwB--p<qJmMr!hvfe1f;2VQBoo&zW>7z$kq%
z!ra33os9QQ14HDkL8PI716+LhDyxHS0&(IY!%}W;bgx&%DwF1A6lty)!Yb<F6w<pk
zZu?s8f@4*mw-F08e?IK&gdmaKlMhou$k|@P``-BIY4oan-wjOe_!}!UdWchz)a8At
zUih=%i@tAPM|aRYQM-*8m1Hnpv>8|BpZ1Z?+^6-}_}hd_hb83xh%zX;f?8epsn;bP
zGPZj4#<SjbC;CgT5>;%iDI-C$Nl(QWOg=akYXj+3#;nm_VKL)m*8LW>Wa#b|(gdp>
zC2K{+BAt&g{>*(Irm-X#&f`~np8au`SNutgJTe0EOlbmLD7<huy_ezkRz9{wh^X%Q
zNDDdd+Z&HtX+xnON|vjw*`&PM@{X~@{noaDoUb*~=~|=rpc6H-_^nSDl3lSC{2@>u
z5e*N2XtDp)>_O<#gF}iEn0<6V5@55qA^jqYY@*TiQP5M_VYf?Td(iAsSk&lyTD>A(
zU^*bacvk-qP1KdS^sg{$GilQthq&S_>Va>1(%Q_UrbXjI%f>wSL3Y$A5*!RTzqNF;
z3yT{@h@F-rA)d*UZn`3S2Pje}-)@P~mWbizqh{Dvt<-)(bp0$t*^t~k@Rt>XakR+;
z#^2xHuaCZIZH6`5&W^$HWthH+b+c}dn>hmAh79p(mA$5TF7i+Tqc5n4YhCQ;nn&Dd
zPhcu<>#}fHqu<y{=XW%Wzf!hA1XSm9?|Ne@ps!rg|NMe3agGz&C|pT|-WZvB!M~qW
z!brt`Z64l!J)G++_&XN-AxKt6+T#E5g{|b>IE4J8!RR2!K60Fr>QCRvrK2&U;lV5_
zI?JOE@#BI0=V*rQjb|UZV{)nC`q!mcNR@$R_r8LC?j`=!k=Yy7>c@t8I`zhYep{;p
zy_)_SMNhxrDk2f^j_6ezi%H!}|2igny@rp3l#480<G|)7TSv0w((Ax`YzA$yl~0k+
zXQv`oK3CdbETQR)sx(Iil<t#OSz|^EdD&mRiQs)d!(JKAAwDykK*G{V7K`PRM3Otz
z6QGOqyh~&%I^KzS=J@bi>TDxJt_;l^ra<id&0QFsQ4cGA<_CJp#f{fSn_rKo^i)VK
z*<Z=C^+(4i@-KERd|oNkW9sydH8EGAJ@G<yedFLm)`X<j_>j`-!!k{~E~e`f$DSS>
zFKBh^)il~2=pdh~tGZs|W{O=myS}!-v{z?5iBq4R*0^HCbJmdm<zVyI#X>T)9q<l5
z+#Og=u0uWR*6`8)j^rgd&^KypmhZ?3JnGB&xZy}KRCR&NkuTG*&<=Ry@5On$l`M2`
zD3&vOlsM!#n|JwktRG&j^A3viW)S_!aPZ_=MY}h4(ktvcVo#lUt|<Xe{m#i72V;9)
z0CkpM`iBlGZ8BUgwv^S@^)Z32@{1pG9AoQd{gUSsO~a|&;rbS;UAEEN<RZ7yUbhx>
z4BjL9Cm;9WHroyB#rO{=fBM1zeL{84$8wz*3V!ZB*12x9hpi(41buXpF+x8@6N@#E
z^=%y#*L2^CEp&mYyc6YP>$2s#9=?`m_;(RSTxF+X0z9d=pc3pkXZxy6^J5<nPxaQB
z><%$V&~?7G>QsP!MDYZQ6FX9$xpFOX1wNtx`P;wWCFzn6O7?RY&%G5TtD@N<ROu{j
z-2P^pm!~1i0X~VG9XC?~m^=o2d}$A_+~JWAvXEbPEd8R}Hlgg5aE3>Td+H)@hRbjN
zcAB4Ux0vL&_?5_*D&3SJ(=62>23cJWUwgXsQ^RkBCUf*uc5bA?b%FaV7{b}qT;ezI
z(IyrFWy>Fy1@A*-5<)`s85X`~!>2L}?|Wpgprf1*Q#7pwj9XBDw83NvwUmb{>J_ru
z&=vR=h%-XWpK@uFitEiz(6pOdl~{8fH5;_ak*G{5uRC0-eNkJ}jBY5z^sBfqZcYeJ
zZOVjL?;%{$>duANmSn6T)IgqX1BVs6+T9nyBtNO4O4|t1tUbCtmvZZ(RCv^GZGCB{
zKi^kE*tsLvU@`8>g(3H*=*MpuTq*HQ=L8I#)8AXa{FqA%f_AulkVULsH?_QWKvT&o
z20t5fB-8rpIRQ+VKN{IEtOJo!#Z=*9q$qPk(;e=uv5HT|i$loR8EHRq$_}LFakKuS
zj@#Dl&C*WID23(Wza?NxO3=|Ud#E(2WA<-R$}{6S{o-J9;N{cw%z5y67fHQ3I?*tI
zv;FX)-Rmck<FJ}+Feu5M@0G+d-KV(PYMfIR@FnTiM(B#|n_dNH%vE&wPqi;uSzj<(
zEccmxbt&IJRSd!NjR3n)3LmpkuXmeb94^OWIpbcjLT>_bxlWG#hSS_X>~wS8zMX(p
z_8l6HX%_|_7^~T44*_Ky!_Ka;M#U!c_JhH-G?4Bnr)>~Ndf>S9J^W4$M@k1fNn}Ty
z5_M<qF@g9<E&0#yLq<KltPvI16MF}aD3l%4RR_Oz19_j~7%$F2EL-=6W3o$<55r$E
zvuT8?*rq-2B~_vkpifk-)SJUHxtWr8V>RiE5B$WQ9>}rAHWUW%_OAq<@_%NT7=dWp
zhFbLc4es=p04g;!%u}(eT(&G1dzqNJZ(iW8dedN$Za@*4k=Qk)n{V$mkDi`!+#QcI
zD?%EnWQ#j`BW({9lu)k^0kyApSGRLS`+LEb5{nvNi3LY=u`dN3!XjUsw5d9?>5Zp)
zB}49N9D0;TTWt3zApQ@>3l#(2#DP(U*=ZFm_E>d$*psRNad&+@$DIUaLhX_GDlf^X
zhYFg6hwI*SU#iAPa-SBl(L<hN*{P24V=zt+3yE}IRb7BfB!Psaq2O9!T}?YN%J$lU
zF(!^W?sc-*QEFFw%tV62Kvp*`*dzLF;f^GD?nQii7i%u%GnZ-9Zt5ZqlT|lU$hWQ@
z{j+R@MX0?&PA`2L-y_rv$u<oRr)T&3qlS#yCqJ?<zzax*>cM`XYFp^b-b1bQaKlmN
z&aaRbO)jO%UDW*9a8ED3Nhdki_$H`Cqe`E+Olrw#Rl&`pQ8!1A6Cl)p+=ju%Nhqf&
z^c%SAvA6@=eT!?4Bg96!jL9;&3b&`wnOThH^_t_gtYo+A-Wx*S_Bal3X6V(4qE3b{
zA(!2jBQ3D(X{_s(?&RVT$9n^>uh@L&sy{XvgMoMQ%2&9JVUsa^Pt#CKSK^=^p}iG#
zJG_7LOJ_16zvf7evFfAH#e0X&%H1({U6Y?0gWvfXW~SE;Dtf!JT;>OuwHLo(6mPnR
z<G3amz+rYhVbEg76cpoPAqXCP+x#dI1;)<+AD?7`Ge*zukmEXZEP$qUQgIa<R~Hc-
zUt0~53Bgv;U=3^JxOpmxgHhVT?08!0nF21`?4(NZI0^~&QeGYz;7MFyT%}H2!OTM;
zPBKaC2&F_^nL_~^?MCv%9M^rDmqU8}VTZv{gi+E+S7`!+6y%LKh2YK>;guh@@f4#H
z=HmgZpB`$-wep9lB;sT~uY;26_wywOudMcFI<W`}!Hq+TbqRM121ltf)RciNvNjJj
z+6hj^CWAK4)P-z_8?n*<hq-qQue52}Mw84$6Wg|J+qP}nwkOF%6HV-7k`>#|if!BW
z&V4`cdwke`_qX@{cdeuQ=&QQAuUg%8)>$>rxzlRAtuxbEgLg`w>t&!_Z<u!H<yTRk
zyFtcxYo2OqtE>3ZdAAcBH`A#M*lqr7l72b$8Hi_;zfAggzMM^}-9Rc~;n9$?;b$|z
z+4zks;aQApH2||($g2bRi*q~k5O((v>``b^Jb1@_Tgm0Ix|k+|RntXX#i@NUoG{jq
zG(98!W@{kn&~+7l_i(H&fh)6-rtg|sL+_VTPua1i^`~KG|D|!XMQyE`Blq8qe>J`D
zO98m2V@0tX9MZ3yPQ`81656$Ie^j*@qy}e_XKE)I8pjb<BW994Ht6J*oz?3klT5j8
zf>E>YT3tuKR8^R)9w(i2;J4Q7bh;^iys7dcU7*4FkWwi9^?Z?9{8plzbTU&!<(K=@
z9rJ5dXqDwsOk#ec@Dv6ROtZ<{b`?@h`W>xp)}H6JBuG|B<8^iF^NF|a_K?lgI*>`#
z9T%$Y8W&Nu#U+tur|9FQkWPHoYjMBGr+nyrjaH?ooy98d-7s8xGJXO*bL7TtvE(6B
zmWc<~cXNzP<#hmm984-T6Gb-Dtz=WgBIlJv%I7t;9LVn{g?%)h{2MPpa$2EKhV5%R
z42sMoBeWCZIj33yWk?G5&@Au-I_ZAN>p2!5^<fkSA#nG7xJR&(xyRN&g%-GJ(yL0d
zC5erCSp~D2#jqIIS`=vv$)&LS&eoM$!vj<Dll7C}^&IzR!yMw`?IhN}l$Y=F%itK@
zO8V`)m0Hx7;Zh<0S1_Lg=~wLC*JEU>qYjxu+{{<k=9sZ^Xq6%j0gv$4b9Acvp2Z>+
z_UwA2-u0Pw1iUdiwZV#gyRl?+1+AS@2gpO|j-hv%7j5UfD%VEpn%|$i#Op|}q4m2s
zQc0u*^(N+(zg`H2T5B{C61_j2BJLW_t=A)SZu3kNX=iFNk_xAB@XQ>I#z<zmG3xNH
zZPhvC>F|i)^t6wbmMwo-|8`?w`}7k2-O@)Xm6UmhpFwk<(=66H)g02sZTza}UFVQ=
zx5qD`UF6^)sbV~+D`#WE_xYtKvo+-7bU;BB7o!IDN<N)l)i*ANxS#pCl4<lyq*8T9
zg#ijtZjI1VFW&dxt+ryk;^h*?Q@j<{wo5mYlvb{XG_4LmEf^s)Yiv4mGw;|PR5mYu
zDw|MzMR37D9!P&nPL<?BKXk8Eko=@~tK<O~$D+RLHdbg)F;{=tooe^}hxF=MWtS^(
zDlZ_B#D^66@luRQ*85LFD1rLh{gkBngLDDbTm{Y#Hr5F)Tlca7-+ukgDOQcLnlw@;
zxgBpb0^{R4SxXQxN6fNW6lkk^^Q4yM$Q^5zMvqOhZ+|vm{oT^9h4&3aK87TyJ_ZAU
zmJui1KW!q<lf_<&{Z!$iNcg3};mq~|v_kYf0vo{N9qrViy1svB7x*2!;UUI_zimC$
zRGUDs&L=a>t1E2P5>y@@qdFh>EDXJ}^)Y^S3?7*8qk-yrCFCanSBpL5Pei20lv`)`
zO<EyEQO!Sb;`9c0RjY@k6#lBUv;CvGTxPgJoy9lu_9C?xhfMK^-9J!5L-^8ht?cIR
zGI~Zg3JEIVB*yrZo*h}D_YY<-nq7)Z1thSVCl>^Pt@axj-nMGm&yt#;cb#c4!(C@T
z#55eif+(UMwH31t%!gG2o@y`&(WzNgQMm{fdHE)OpdYm9a5rM=Ax3Xs2&TZ90d0UO
zKSr^<=?&u$iB_jaoP{%>V-k-)-eIS|LgP=>5SK1r<PcjNjD69o9>PyVSVUpf8XYd(
zbzn)lsq@>Lpo4RJhasze&)UF`;p^M`0c%!cGDI0UI&lvM&SBw5oGn}46Gc@Z?{U?g
zkpD>VJiIQmar@o@Ud@)lZYCgiJ~Q4${1u#ydulcBl_Fv!Hhp-s4Y!GSZyhXg^_I)E
z{ufKwZHD7;1{^w!2z8)&!t*}vXyNM8Cu3zM%Vx;_pY;Kvb;2Ahf}LTJ2~Tna-JX*I
zGu{oFLN)(ZdK8d1pXt?S*<bCSgfHqf`>I_1<$5K%S((ka^pt&XU4Y?4P|ea>3Fjb0
zWM$)}I!kt#&YWR_L0C8b9dK0t;bVS;D`6iC487u4*N-wYujg`+t2Vw+O}4<K9BVQ*
zG;!SQ?W(6TA2vNB1Om(uUk`kXGZ8FKTf~?9q0;mm;e*;5$#iU&iR{D;gu*Qj3;qPg
z#E~uqqP*kOlBm91!Crx1fir&IJc<&ytKo<qO)OPCBd{^r{o7zP37bxC9=l%(c`HU9
zOP<w)Gh`ETq(k@NuPCP9CZXB-1YxMc9nLdR7pKRQ;>aMWvDesKkq<XWU>|ML(sl7?
z$+=IA$317aW$x4W3KB8km!K7NBz3|bYR&nLKj&-T9UV0;2GH^w1*_AS)o)y?t%4am
zwbQUq;2Jj5W=6o<(N)4ykXLXy^MP)+ulc^lYfTB&tB{jEr(1KiSL|WELVSueG@ijk
z`J15<XtEog{GC)__-anjwN3r!G|1LaU{nH183WPjl74Cc;^6pDFDOI4go2Um<Z0BO
zQ@`=BS51G+BykGHV}2N{8i#`8TK9xZML1rNGhhJXQ~)j@`K{Vy^HS!tz5z+V$82n~
ztF^VmVv=Y)8zbbw6demkJ(<my8^uutXm>2h-w+QOC@V5LUMu<mMYmvC1)hNmMY=z~
zPMXCx7$3)Q-L3Oj=f9Y%!}qB_xQ!wNn>lwj4cN`l#biWknr~kIxG3r9I>@``d+Lj}
z@a%v5@%=N3!{Qw~aA`9^g8vXLhqeWcrptVK=NiwmYn%FcYNYV(l<a`d54N(ipUx}C
zNKf0rjO;OGlSP3eKJDX%Ho>Rpwhc07#Y>1c@>iGrL72attun&mYqi+eKmQ(!XR0v+
z7vOd5Gu!0!2lOGH2L>@%3*PgE9M4xx20uSGx;*!Ks>VG%@RHdaUks79rCe}VL&auF
z_DEai0}4w%-N*oVc6%8`=oPZ@7=#3@v7yg8T!r^!09VRnNZq#F_RZYquGs56{H5M<
zDulXQhRa%JGDO)7+;W0ZV~h4ay`m;D$wd{3YQx&m#5U{6u8iTAfVh%-l}eY{4HNra
zgjbA0lA+V$J4p33CtHCI4&jl%;F(wpwj(X4@O^S^FXQj{)iX1AUlAlstN74PT?K?t
z+0jog%a4Lc_(9rfV&1!H)0J%L<Ayv2y;x=jNye&_b`YcVOLzt-0Gz24T1r(T(sz1L
zIXSE%b+kl3!beGZ@58e`RXvaUQ<G}X=1qyuB6O|Xo_?FRKiU$@WEQ%YinDqeg6d%=
z5#<1GT|Jh^Ut?FSnM@5I)GKVJq2gAI`?zGAN1Dbg3tKx41r>W6lzi0FdVfVk@WP(8
z)0~>^Ka50QEJwzfId3MkLpD6WU4N5394jq<6h}#uwa&s=nUQYVUt?NCLil7fev=-4
z`ZM0L5j=84$#C1k%iMML#D{s+ySOSUH=CWf&;?qln)N=)G*!kqnChn<4i^JzgH-|}
z_w;~08VjC)tl^(EJ8#I_kV`j&YoM3^hDqRtCGDJ6VdUpsAKMw#bs@MGe09&Fq__ul
zy7nnUQF2@?g~jtxlNzD#mwa!tQ0w0#*_*vPJIgb3o!yXh@6b8I;{a^=K*0B{OYZB)
z+0W+K+{?a?@p`)u#K1{Oyp4(?3cyqd7MBw9$A9r2!rlym?ngT_-L(Mn(tbv}+#``v
z^aAI3xDIidu*cxy1*qczW)}S7g!E2?Xl<M1v(Ni91&y!;+(D-IPK{@-dm(CV$#l8l
z%(P6QHaD|!86b*!gBd?SXa=l!M&htX{~A6XJ7T0M^$^r~tL$QQe$_wF1VtIh^$VN+
zlvl?0Nu=n#8Gf7j@xZ{Ti`o141ITGP$i&B#pmE5v83RJ?JfqopNMp&8y}0{>ZQla#
z*_Od%jE7(+GhMbB_9uez-nv0zCvEnQ5HzeS69zN-flIug@zD5o(I#Y)OD__^ZY@IX
zhF^_K@ELiBvjM7F%G^?36?ZU8tc$h;9+Q7#7+%pgDf`%|8r;w84j)XXXC1G56brpp
z(?7jx7L>1SPSfHu9t1;BhA+Wu3}2+ZPEb&xCqI?Zv@{SEejXN?%$*bFj)|uYzS!nT
zkGk5a+V-6*bogXJXP?Un*D1FqnkL;6GJ38cmrt)4*TWF#g&*GrGf~`x5dBGa<R>PO
zOU3Q@MF-1zWKS{1>PH%EH@o7)*s*eMs_$^PzMK!Wq`vJ5`(yGbh^VpTQ<F|ZhstEn
z5)XB4@V4eWog(<83libPX2?`N4|Psf=xfv>)BiYn2J5dI7_qkPAE2XR`aEo~V`AEn
z5i5zhH?{88cQV-XlO}M~e^y<Q3!=ydt__q#3BR(a*2b~P$Ut>DfX%kHP5uz;QW|H)
zH(dRVc9u5s)2q!ZsFP~N2o^7CEZmKTa~oqkFb~NRn_+V#80uVUB=OTail9JfeKz3^
zftf1C`eDi7w%y?aiyHy0uY??6@yd=K0G8uS(3>otctY>LsGIFzI#f7e^jID8mB9ZN
zihSP)?#X8Dm&*`7z$H_mYIi$tCy1(`x!r_8<^wv18H@T5lF;>8%T)Il()v(M2!h=A
zq&G)q9E`aAjrzp)J7AW2*XQ?Pro$Su(IaA@+UPOQyILw-ipaA@VU2$#Mg?8^Y4xAq
z55!dKe>`WkQv+TpsYFEn;`4o-T*fqN3RZ79D`U11I=GQJ$fyFahk#xtQO_QfBFaN7
zm6Jw?t%N%-YBv#12z6pqkjquBi?GInn+J#<8k8-!_<I}k@wiu~_@jR%^!Kjg`Olv+
zEx+Q{U_<0xU3AwU;qy1sOEg)_o-ro<Cx!(?Z->`vc^!SAIv~SmbhJ)2i*ouiYt)%9
z!)jsc#;$+FZbuEoV11rR>uc6WmOXaC>Fqs<eMX_li0`Cz<*HTeujL@i36G9dG6V!Y
z?_L~=wIFv*q3V0TCyeVg`~<(XV6r43L@-_o*6?aii+QVb*CRgy8I{caWUP=LWpob+
zOiO)F%-Lec>$Kmwc`6T^8_yk@!rLNyBzk*fsPj+Cntjzb8e^tOmqUD^tsx!Vwk0pQ
z#2iGCvp^@KlGzAa)Zb3rb}+1jd_I4?=?}gpOm0d9!l&YaH077_500rcDZj-*KW}kW
z29(#H?ZN8aEDZm5EWq12Vb1S2qP|90XQzOn(U!OD_I68X^;fj<I!t{{bhz|kUTd>q
z01&#mW5&%JYbZ8wSr#*O<fj%*#%Xmt2xSW9<M^%zgYN*>YfVP`nc1L+%Uo4+DWFo@
zFKYy55bz=Q?e>E!RS2C-U$5FfJ&%m5@8f}*OMzi$gUGpc9bY>51>%D94rdZY3E*iu
z%;YT&XJWo1*bZz;+?dQf$KlQ4FyLUkY^jB9H65sUCCpi7j95B-8_p~@Al1KJs529<
z;1OC}u3p=Kc27n(c#dgT`9NlnZ_nL#CcC-#=4HO@0vuuh7dncyFLU@=SWE!?WT$H>
zx^9}Yt#aw8a03e9q+!8Ad`Rq{l$U&6QRK)QWjk`mrMM~;Iyd~_Y`tuu^=0(Ue0MQv
zEvQrkq_(q5x^^iN$7a*7OvxD9IfkyAGdp7GW`~kKR@wUBEzmk0?99a(15<WVRIrM&
zfn|#kZbv=UYhfnKh#eXCCb)@>2ekV^Eq~8K4kO>o^d62a{rww%2~Bho--}S`v&6(D
z(s-F2Av`|s92lInTG8W~Kg^<y>j{j!8*2T-!lmk*$AN-{{Z4A%vY%S~92(-1v8!O;
zQuLfdxA;CMzx)-)BjNY5E+Qu8lVIAcu9q{r(-d$8$1783A&^k7dBy%wv!>{p<@AF5
z2ASwQM69314YNt#1g7y*qH~;Px37%x`RZnN_DS+MB#TBf<+1LgCLUVO=emaeGbtwu
zT5}O-F5HTsuMg5G>%XgIY{rrPajCcZtflv17PBefi~Ul24aq(8n7u&P6Ju}f*{hIy
zaVFL3+HyV=A5I9xA-MJhfNdX4;hR{v!R}sKx67Fjdnn+q55l5pv^hlQdisAsXzxF>
z_I??B27YFaTuKa3ygoDAl0xq&kqRM(Uan8Hs*wB#_xT5rdZV;oC=~a7m;Dz@{_605
z0Cf5X=K4PkZ2iCSj_o1mvdB9UZCe3%-q74y?7GK8<%KJgArZt*1A$DbxN|@iFJAz7
zjDgO?jv0;S(~9~7>VjXBU<2gWv(g<%+Rf6_qZ{sA6W|?(z-fXUZ}~C8n8A4WO%AuL
zn#5&(%$C>8L04$Iai914aUFGK6?WUA>a5%Pqp@h1sk^z5jHl%60G@sA;B)>y)4(@v
zeB~5+PWYj<Xfd3u{tIwUfSGFJ9%-U~Jb3S(fa*$SMmtP0xW)rxL`{4)n`juTaM60c
zQLkIAx@F5+YK$Utur)S0Kv>IIxzFOP9yoim;%~?dtEj61$TzFI(_$n`#~sQ+jdfth
zS84-CX@|I!0zC0xJggiz>wE3SwEX+Uv>E{y6eU6mFLh{VT#qIad#?u(;!2|88y>hQ
zZcC3&@-Ie{E{*L~yd6X}WfNaFvmqor-rOUYI;u+hH!|Qg-aQ~=4rGcZ@$ln@^Y)N$
z;FhF)WHT-7!5ogdRg}^B#B!(Yz58{;7GIwd^@qKRr_-vPKII<skaI=qTTKYol!(Md
zhJk(GvHzo`IXr?3EMvJ0=S{{%I5%6<+J4tnpEV`LW0zQUjczOL<0-ClBbtASs>nrR
zvhEZ7G1b@zY1fLF-(!ODOr;X+m6cDxYFdVA`s?jn*73Nc;<~4(eP9)9L@WRu+oa|w
zdAJ}Ta-V0YSHHIw5Cj|l)cFc3U_2B{){y%s2VGUQaP=+^48<wJM(o)~0atzh3Hi=1
zo}O(uxAMW6-=S_?Ldy~1D^f9#O?WQQn0p?1Qa1wj(eoXX-}pH^rjVB-+7Zi;&iJ;%
z>n?{c@yVP*>2^6may0-udmcWTaZnad$n_TFT$_0VNanhMitNk%*Ut_qb90oG(2bjl
z@Ast*kxw7@4<AuNo|=;K1LDRhQ+ynZAY_;ogooO;DX_HBY3wt~R!hv+JJB7#8VEaT
z{?Uhb@T1DD;{qb9H>e#o<{`AeM<Yi^xFWQMBRPIjS$VQD34rm#6%peUPLXbSi2|au
z1^!Yr3?+Q@)-;cinLc#mFq8!VpqT0wX-8ni#1SZ)-90|Sw01!Kt_N#+$<@u2jdeog
zxMf*PUV6p)zhhNxIpbA6BdmerBH`Y{oaF2Ki?m%aOYN0`Tbx)H(I-=|xsuTV&_3>F
z&c*0wPg4$an#$f@c>?Y6ajk~Jrs!t}F_^a2;h#p|r#l1xLHjyzxpX4n80NkuN#PYc
z?cHy$+m6+MD>Ta`57$o{=>olSCyGr!oSJgtbZ({=M>U@Zt$HsrrL6`2f$cmR;Z?p-
z&O628VI+ueH{WEaFJd%o!?^0bu<+MMx*3Sq8fv5d2y}_iY}e?CT}Gn6?FtLmj{a=R
z*0m!m=2qO~0VXEIndK@4_t-;*g>R!|AKNNv=#^+WS@XaESb0A!X-6UWj>p)Qa6dU+
zP~N()kj6IIS&ZzNiK@8eXRoVhoib91O3P!G0K1gW^X@<0$h@B@2SuNS?C5FItfiiO
zsHd56-CPnd5<bXkmj$Qd`416&_!pGT04BGp$;@W7-@w)v+dxd9kK5v%cY^VFeK8ry
zQ+_#Gv}Bd`t2+$ceifkhri4%vLCXB(fFbt#yST!nzPQ6<$f|N#n_PPU5|&15sZqVF
z>B;OOVU4RaV_~3kA^oQ_Nu_PW%*2UoFx)U)!KI$&w#Bv)k5ejRB!1#TZ10jh5nD;5
zwe5;#<tV|{{s#FQq=8Hy=Lwav57+9T4DdsAJ3sr*dSSz+0e8*k7w@;g;3cOr()&`9
z&u-B8tf~R?hTSg_SdxUV3ggG!R!dJXfcwD=lnYvl30b4Dpy=_oE8^kssy-V7j*FD!
z10$6D?Fs7?491UlMC}wut)wg-&7%WML;3WwCbvUuo*$F;3+jW+cCMiX^q?=X9!qZ(
zU#R&wj^7~n1ENG^USO}w{!*U=(_<c#>EW&97tB1ZJ`&#NKQ4cA!VR|o#$G*-jBiHt
zh}thNL&j@?w;Cm);*_LP#7236KlVMBm|6apqBdUtm4E_i9*!aX?r`<v2)~2Gf|y*X
z2iF=)4C*bY`q^mTQc-vV&M4N$5a->eH{|<OiRAr&7Noq2S%b-gB8&44W&V2*08%q+
zK84f(uVsMyt}EL3V0XlFZvuO9k?aVpue!12gt*CX?O>Eq7zb7rYjm3Cm4$pz++QE-
z1*YLUe0FziPtcU2i2n;{JgPyz?*Q08aXrqwM9eT@uT3hPmh>zyEj;H<)A&{1%pT|a
zkqf~VX@p>ZMgF)nu`e`SV3f$kcJI|ywQ>`9wwx8D$h{Z7*0%}_1y3k;=RC$I8PJFv
zZ=y}oAg)kT*p@WjHqpYt;Kzv_%MNL)Fn?l}^r(79PC!aGpHcMzWH3!iNb?ihPBZCD
z9hO9`EHMv^@$S5SCO&l7R(B?EsN=Q%j#2dzWwi1p-u|KcNRhZP0X%m*f=xM+?IDF*
zR7}wb1uBm0<45tb6OATv*q<PFVteZ#DEU(5Jkd|6oF3qA+Uu@hT5w{)z}G%}!&75(
z8}=q+relLhsr+`xq+t?{AFq}+X$E)>nMK8+rF@NZ{AH8rmS^@ciAn6jC*KqNFl;{)
zuVLIakJFI5|NCx<+Y>&efi2#0*LP+K9_G>3_;DHA7Gog~ragIN)?~e;ql)<x{KZSo
zS~{W5*$kfD1r8%tzh416s4g=$ujh1_wjOa~5@t0G+;Uz?){;NH5@^<!%Alx_4TO!u
zU^jttH~6zM;k2^4QGU3UTHyGps_Z@b=RPhWocpek@c<|*ZbWCeJEpa_fgz+ULf@9)
zlX#P++Pm*&YVR3QzL$!EUGB2nt|oJNoWLSN*Y~R_d*6ZJs}SFJY!DH+=m-@rR>km-
zD{giFN-`^9O#ld(R8|%&{uUSWyjO68il?-UV6Leu5;_?34@y=E#3~S%9?%6}Qj-}I
z1*Bt>A=CPTFM0Rbj++YB0Woh|PUC)*hze~vcWjr*UWcyU1e2mT-0{yOYKGN)w2;w&
z;nL%=wXS}YDZ?I9w`mZDl&h8@T5|!UNQ;o15SzN6aXG+S>x|vRl!*+>L(&3}*!vz{
zZn9L6jNR|`%pH-#BSx09v%`PMnIBsyf3fo&pUy0;nQ;RyXOgaw);<@tU=^2`=7W*H
zcwn|_yebl@E(e;7fZRw4=yU};i@uT#S)U!M-Zo9jjfN<=wl_^iF0jBl25KyR3+^&K
zkP41O0@GxhSmGXxdI}lEhuqK~0$|l6p9^Qh7n0@=m;$_CXP(zV{rK?7ogf_-sj<I5
zS^dzr^}4gZl!`?rDzDhI)U>4c`!y_YLq)ijEfusuyIxP#Q9Xu>2x3)>u9zoWmn9EW
zaj?HtUgm4Qm-vDLhSay0(4o<~?41p9AwmGpKVuEsx-x@yO^;-54^UvH?xPPiCKHg_
z5WIy>dY9ODxV}CFEB?vppA=|02i&sw`spz4fb&d7{7e?~qKNDrz0)NZ;$Vr_ZyK~~
z3l7L-9qrw^{Rd_~Tq_pnRI?~k^wh>zXHl@08!)Yu9`%CfV%2xK{cldp^*)QWJ<HXo
z{@53}m@<(yu&ZgWn!bdPex~`aS8W5W7YVILT+K6b`i&kuf4q3Cl;Z9R9k>POe@cya
zj3FohzyY|?kHv}GOu;1wmWt^zl0N5z8BDWM%Bd>a`XPMGu~IA%gk!6toASqh2gDLt
zMMVy3=6D}UFj1GN#+Lfs{cu?h2LJR5zLd*-hlO!<k!3&)atifV7k<kfLSEjfG!c5W
z!#n8-itln`Q*}#ZI@qQv)EzZ{E=P8SLkfLVM4`PUG2C%gR&|eltaQl^UvYMw-~WD>
z-oHlb&@J3p4Uw!oxD_=k_51<?v&koY(GNZDNu2TKk(T`45Eg2W&b+xUX7uzuC$qA5
zHhpY#->qLBB+QtxcBnpGT)`+7`)}G$HLQLI)j*;&xJ#?3Z2s&gK`b=_TjS?#WMYa?
zqnTqy^$=%MYJkdJA;adtVYn$PUv=?@s!=z<C#zuXYvI;TIuk0GJ(<1KsfoH~>imw?
znOo)d%LK^sr|^ug+qf(t;{<oN!^#7g6XM`Sh2SLAt4InMQ?cGVWqedsd<}81G?3F@
z7cs%Gt%uj@<8>v-Y)rC6ZvCfq|7OBfuawUWJa_*-tV^(;QP*D2HhI6tU3Z4^>neNO
z?G)sV1Gm+8swc_V{pY|62Bw^2>`s8ys0}k5bi)2}5RQE_34Z-nF!CMbrB6`}x|W54
zSzKS@U}YZn*_d(L!>i?7L3PU2J{Jk<Stm95!HLnuLf#aXtnmF(sqS8hcWS~9Z^rX!
zdi|4F``c7yfcx?ym@6Uu#Sm6}tC4yr(`^(IB@QgEE+Fch%V_*UWskh3_xDQ;vxW(T
zMY<Zf06E_SC+Gq10=*6GV&jSM?b|=rWMrSy?CROicKF(S<}6C|h4ZCZSZ@r~>)~XH
ztNwVp?vyu|b2t*0B|h(PqleKLJ-HW@_GqnWy{3w}E57`l<@)ybTqrrSk2Y#fU=?wK
z2YXTC1xL*Gqve+`8bqawGFm|`zs1IOR!tbh#R=&juyKwPVj-u~-);lf*5_6Ghld?W
zNY5L|;ChumRyogHJf|lF(w0e&E3~4w6An84YLI~zW9IpiHs8gxt8GKU(XV$1l#JS#
zw_8qfHfgl&pm;aX5B_vFbldv&<>Tc6t{_EEuGGvJPNg%6Jt&8(xGfI<jI5^_Vb@rU
zq7vyZb0c4vK(ZlifOW7-KeMSQ{*lIZ<OZp!vjw$b9(E-&#aGS{UDdWEi3=TK45a~b
zLYa>P|JNuc<MlGWAa_uV-K#VT4{^yAhC{*N+iTN~C-U0h{tF>JM<7`9guqV_3p@3D
z<y|l3A@DYYFazPee#&3?fw}|lPC{>0f-zoy!<YEA3LRD?=6`bT6s+KItx(ZT<(~;;
zU(ti;haw!uIIAHQ!6AJI8Rb_JWzV7AGlGrd8coIh2K_jnq_&o_WGb-u93a=)hP*PS
zRq|F4dFB+^d+R>lNE}@;v9#fr1{>(j@-*MjUbX(mfn>r+)Lmc{t1A1-nl9%ZMKPCh
zE0*mCP+xRb;hdP+uCVBZ<ffbrmSB&zz@Di;0)Jo92o9mYAvs%qR8XKEeAzUIuB3b|
zSW77cOLu_>mTXV00*k-jY4JxGhxDH-RcN(+jOBLvNu*LV-jJ-xc%P>~CmZRT*xnwS
za-BeU4|iIBxi{8+?o%I0NsaN6tn*P0#DQfnc>mIZ_WZ0dHX&8EsX-Gd{X3U=0yT@a
z$+3rW^IP|s*cQTM+kHdywf#z&9}K7sEvCL@<Yqa8)1Q>ViqD#i^I#gTM=4CQof2VS
zjCysus|_?cf%+Qp!hB1$2`!PalfIbzLr(K45IpWWjCn-Ni$mY1PN%@jrR@9$ttcft
zh@jjM4mZOMk(CQ@w-+*MpIynizx_=usDLKsM=YIi$%L5o;u9rfsY`gAzsfd(82&0p
z+_yjj{6bciPr_y<VRWvE;+J;P{c-{J^Q*PbWr}0jL)Gxa+)G#%A@Ci_1CHR9EOmbe
z6v#SjC*dxBSe*VfktnGsc$_jNeh-3RDW=hbwsU8}?ly1SE^}O%e0XWbCkh<~4Tie9
z->D<-PxSw+rkv1@h2w;H`C!`%@cp&p0DBbcURwQ)+kJk)%*Xbs@GoY*iV3dS_O?V|
zE`Si{?qZQGe7Vwq)@@OpL@yg*<+NZ`x+2qsR-3;@J(;zCS>HDpN>SHmeVR{z_!Tm_
z#e-&*W=AR#P9$u%{jAd=b^5iM66|-FvyP*QmN&{4!|n}P49SMwJ;m3#FX&_A>7^9!
zJhb{dH67+DNX9F7XhiJlo@2}W?Hv89_K~CRS{#dDIJN37CxX}a8HU~j6N5C92gh8E
zxYssZ7S29Z32{!~y<V`An^S5<z^Ypw)hEuK{SZ}AnD(!4)dwPv7yMJ!$|7kvyb+BD
z7rrwG^7b<uQ|cKWwA=y|0yR={EII4>m{Yrm8Qd^AArL5;wP|NA`bYGZl1Jy%%Zr(4
z_KgC$67EgkcdEDw1F!}P`OkA@O*whgHYs;}a`uMf&$Iq6E8P(cX=;SL$KM3TyDlCu
z{NOaH5^AzyTMn#Rh^MN7a*<U9IO-0xvi%q2yD%2P=%Ek0lB{KFt0}i|bu1eLP2vXJ
zRYf~=+IyEX{hnZW(fk)ZdBO&vjL6LVl%_-XwKsbrexjhm_tB)ge<*4$?Wdb9!<Yb+
z$pK0fM}tsk|KpIytW@;pP;jFJtO<|nV?G<tlBp;6nBW3Qd<Kh&_Wo=`(}><JIrCcP
zIDk9a<}4I`@efzPuQdQS?<C~3PweYlo2-lu1CCoEiyfomUX>l{qrB<(l_nU_{%{#(
zO4Wi#^%8d#;%lmBh3X{YK4ukmbh?exztT7=%k-9~FE&NB-3*i^NRy8ehekO@SJ&5B
z*2DtlE_~)6PCpFA)V#=~-HgXto%6D&KMm*NmM!TdG*(kKf($>D6Qs~&JPxm2H&yea
z6}Ov1jGZ>*dYFW+=NXG@`3+|B0FcP%y9eol^NahQ4@a9f(maB3_}Q`pTlBy#KGlBi
z%lu3?@A01N_@2m4T)&?z8+L_49_`2}<WDDCf7H@3fNIEp<lQ&0rj@jjL`$*D`RRvl
z<C}|vO5{ok%5_lJ`EZZvZZ(p3cypfH@m(BgLFW>yAl>ffy-OC=oOPNwQQ;u!G2Rpt
z94}P$!u9ivEh}QL@9w_)0Qg%&{B&b_5+nd|6}L;x+oYf)2+X`}i0jp8rmlB<luhSI
zmX-zU^$pcw>?_nk^plX)FGoKI_w*&SGl8{m?t%*+J_2@y5^j^CAJYm7AJ?+%ovCuY
zl~nyVc&`WE&C?%y*k3oGzzK*Py41E)n;H^&JY0(<yNBYCHef{suTLvBi*6=iM#Sy#
zqS~cig31qpuOyid;vgLU@NN|$&x_hBD|r*lpGD8hz+KyUv8*Ytf{ShN@5lm@LKhFR
zut@KClXV;gPzWL}{1KlyF0O*?1wqsmuYXt9_!(2B%0w_($YIYn3Q(JnfNbMprl)MO
z=}Dup87LX93d3x1`d{F7@f&Fdw}cxz+Mc2$<}_b5T(t@9UhYhuw<Gsw`r|)tJfMI%
z#Z*Uq<eN5DULv+EMIEK9f5xz?(zogv2Nk4cNauvTe2+#}6*oNgZr9G(a&HG#r;)-z
zuROY{&=cl*aVV$+vilxJ+e25()#%~?OeTLaQ(wgcTp1lktQz<4X%%p)%b-4R%LHkT
zI-;CJ%xH<*icd<Q#Sq=)hh1DLmc(q7XAt$=FhA5VrMhGURaC^SDuh}~zI<5AXG<7Z
zc}ib0xlnfQar$3NlrV)!Af8mCJrDkzDhNG?bhU)2XECy=&P$~ZA#s>kp{;|pCiWRo
z43+q^V3w$V#;c@l!`P6c=~<s>PWUGQVtKJ;Z?QD;E-eolLHJRNp1Ww!33Al>&tJuW
zbKe0q?YF+j>&2vOlkV`{3rqJhg085$Z!H|s3DT|@XH$w2MxE#?r;H3%RFzP~{6t_W
z3E=087}+$si#&h;%tU@|W&rL`-xpecM4OgXu*XC5b3)UF?E|jih=KF=#umN>0xq$`
zG`=Dg-<;l{mHkyG)ko)lWhvDZ*T!AxJxJK?lj&2oYEA{{d|N7`rW9Tc@)s++>rvIe
z7><E}F=yFYM-!7H^QIBR3wvdm{v^)<)f4=bBP(CUq4*9VeZNsU&um#LWUw!0xXXiz
z&s^FOITF7+9UqJ9x;`K3*~~@$r>2yM@bA4Ky6vHxs+-2_df&5$qvI^ePi{Rj>W1Bi
z$m4eWOzHD<)E2prpgLy<Vf#mGEPs`Bbl1c4(s`gK=G_r0_1upk%~R(IF^P5ts}as(
zvhEBF_K=`4e}6d?=68;i@1-Hv{V+udcz!Lr)qmu2CvuYcN#USD0r2Z7gsdMx&!f?@
z6ZErmQnmu9s{`jEz(&*TkXz`vFSNxkKdES?ym~Ckwk>n!v+_yjMSMQh(8v1ZT$P>3
zw3gTzn0T5oyXrP^9J~4X=Reu}t$kK@Km(CdVHx)zz&LfirJk?b<m_tZYWdi-7M|58
z->m0+Pw+<;C?FyF$f!W_2W`nydxwdiOWVqk0(bcYx2i<SbPUMn0Z;R>jf1%EUlZ0j
z&e6igNRtv;(T%bTMg10BmioNi`;B)_F}!kd1IHw0i!Nnd?`but^IQ>~<EQ6BE(TO)
zo&y=YbkxmI|J}pAL$(?opI=2aX2cbcEc~MIx!sb7>w;!ol&Y_FibE})seJ-DRw&j|
ztDW&-aLl}DJ*0fRsZaldTxAN`2>Fvnakv{`L0s0M4tNhtJyhn#3<&$Jm>^QArQK`C
z+&vo;nDdU6yyH9mty>_~3Slv+(S!}AYHL}n?@EBYpk+F#l8*TE)DS{o&bAu~bHGpy
zy#?H@9|3=0{KU^z=PTV9GgjT7580=V%)1g<VzWtocKV569WzRASo4*B%JCIk_e<_b
zI3D%{LuLF0!|IPnlL&;mRr4Wp4aE2ErN%DOz5g3b4&x7!bAbHN?uP8w0IpT@-5eGH
zl+gF?d-$byZi-BTu}C<zIqEFLYfx?E8;Q@d9wa*eZ<7cC`BY>qO3gALoVu~r+zNX8
zQO<CM&_cgbHwa6ybT^#~c-;#)h0cf_#$>Z9VuonSk!lj#?%PLc)9M5ym4{dN3-B&H
zE5&FQ%T=4bez?)rk6iuoDB6@9f(bgw@h=ScDW#>F|H}I9djyiTZ`zwa%JBtgP6z9g
z{b~dgQ4mPCJ9|vMM?*jVgK7nh7X8f5Kl374>a@d&Nm%YFph+yC;3P=<@xRmIUI1nM
zNThc>BRPu;PW@dpd3?dYA>yMk7w_$#=1d=KHwSkVx>YVjJZC7CU;kIm-}8<Y1_;l~
z?y6SZA#pLf7nZoMExHwrdNNw#unK~npVRwq=gW)gKVVpn3BTc1)dBrYH0An1--oj)
zh`eyNck>0TSr|H^@UUeTd`_Qq8VR<Y-lNX+=j_>(2g#FFThsQ(>VA4rL*a2p&!uY)
zKCA*CIMhj!$FATM_G-7WdCLk^8AR4MpO;cwvY1^Mr9J{92ogKa3hh=ztv(?=S*hZ;
zPgvO*Hbn&=4T;-c{wVX1gXGXB97k~Lip&Se(F))U6^>|hVY>@0atZ}9>jBy*=l;+=
z5}W=t3C^F@6P+L9s!(o`3TXgYCp#purt5WZw-`hebvOhk1jsMCT8z2~f=!pe*Y3sX
zs`9ahdirq4`SbdO=Y3V<+qaQv4lpJjG=E0r<4t^{%hdE;gUKLH??F+CdW4J~EBm2d
z;4Y6p+F2&*Hq?-7_s}j9atUd6H2ufAB;gcl@_&n2xi6Rl<(s=2k*K*}rT*_&0Mm^U
zqCBUcr;U80k+N7~cMIQWy|uz;1kyJ?;<g(bF0_Ae*fn@GJgnUIkm330-c2uNE}-)Y
zWAZb_Ph3`+Y!f;B>cLhGqJKHaaU#8!G`St4xjIK0ypm+gnGfuUuS1EDT(yB{18*f`
zCQ^oA_V9D@L9(un`zS0}DHX|4rz=utdu(cL+^Vj5zi_%fXgrZ!=v0zVfu0i~XQWaV
z*EwW@<RQl}32(gcc59~}hVtN&k0K(p|DFuIH13;0^Q3OeKVH6H;TfB{;JU-c<IiP1
zl)GCuGQH3n+EE~ue0K?cm0P90jmJH)I7oegkg|T+esh>j(!mapnONXnY|}n`12|>$
z7;XiqP5>DNh*dV+Q*);nJ<KalqhC-A`5Us)2AIbEhM#+N?LadPCW)UW0iIhWS6oNA
zr3VJZ&sI*pvW890V3P>1R`nqdTAa0J9AD9jQa@~4+GmT3o;7+`o^`Jfs#xwMdJ%er
zKT^+in**d@FXf?oJq&@XKnM|H0FTTk-CMqI1qUJ~CY0ow^BA4nEi~jI^HA=Zv&=1@
zPW7wTdYRKewb^93-(CTc*^Pn^NS>kuqBn1H%Bge91WIWd5~D9`l(F#a^AW%C{z9@Y
zJpTCJboL=>zJPz^<fWvk@-3yuO#K_FTmFaS6Okv9p8=GO0CmrU;u6EnX!(^0<IfZd
z5~JXwuCUxDgYkbegbO`My@dIfNj`x_ag)D=u2=G~<wwK(lglGPDl01NGPBgAmm#q9
z;9p|>7b@>+ZLIv?xj2MN$o~gi|Nn5qEQHR6dd|krh?c6j|0ZQR3?iLUzBPmR^*tgz
zD8z-GxydYhj=~3ixt!52W<G|UkpII;<tAY&d#1Ab7h(OyR^L_jFTwR|2Jm$RJYXC#
zU;WuN!jV=C|BNce>o4arF|ps=j!qerk~toIS#bF8w3aLge|dBw%0Pdea1O@DNECF}
zUi?q4`l4l|tk>-S8NN!%d`kav^q+T2jmo;)x-$?5a74{XI+RYB?JLuL=63ib#FPUG
z1U|5XjKo2O7=>!x)|6`<(W*joEx-6=`C@V~mF=1NscCLBBkX<E)@H^T8P#o5snQjx
zzsZXmr@#klenq#&W<r%x4vQILKD~XUZ&PbDp!y0BBzWNhpU1OqpR*<pUB0kHocB-m
z@SMg#Cmx1?Jd+m}y(g=AcmENF{$^vg*H+&Cb|a7h)3@dnGiY)8gVmY+ZN<UOOISf*
zlqUZta}<l1;i0AB)DhBC>uRtQ57Oq*m7Dbb6*vb3-?<17LuV-$x{=QI_(&jnY6FOt
zF~K*oXmEB|awi-eGv2&G(ErN_;tth#4X@84!legZ8ygJs^Jtw@E5Lm14L`Tx$NsaC
zy(3qSSq)zP?KopFk8sc9bo+JJD-?kb2RMcCv+to4^Mv^n4kkKta88wj?K=!nD57xW
zss;es8id|_g~@&elh-cR_rsi(=hE}H1qH+(0YFS7oPhBe57=X2RpoO}&^1eLkxYfK
zysiG&?+IW#D-h<xfyTz)mC)54>XGxe)Ot&*0Z{Wm?`M7Xq<=>y)?{3NSy%>2rpB#U
zFvhH(H85slxqIv9>y@pY%=LAZsO@KBtPhwqT(IT#2ubpbiSIq8KS1Qj1SUN}$ci1~
z{9DHRBH?&f|J6yFK+?(~5icOX*sngZI=MG}Z8|lpe^TqKr29nxRuCv9zguh;)@=GC
zqz)&3U(7OJI>#c$;*79!wCg<cYN`)J7;}9H{0++U*2K5BJ)1vW7nnQ<-CG#N(-IEk
z6lphs5ZwSB3Fn7gQb>-oq_`<1iP0$Aqa(qaG8or8c0-S23jPhP;XhKdM>9WuAPZP5
z1Czf>5bV0mHy{c_eJRrHx0>Q?T?j?a1BjiIUDa-M4+@_k4o23GSY3KH7qXGa<Vn)j
zda$YKfi6gSnuJ8<6YwrfrxTVaA6!sdt3@DErI6g{XZD`4H~Ogtg*G{Aklt$h3d`&Y
z+<O^m=f?x=@!@Ps(t2u4(H#EVT(OwW#d9)`dti11WHmTOGi-AXXZUnq1Qi~g+R+__
zz{PS)MxR9w$P^G}Q$Vw#sepq|rU`J63l94c@OVoh(qOj4Jqx@bef7vCCF4tuDbms%
zhxjnu>a=&V)oDwK`FPwt)nj{f_gI%ZjFZZuY(`jBF~7iD3)}Of7{S^ajAEg8^_oi!
zHi&oHQ3f5rVgHNrrw}CT_QYRa>K|<gM~Yjj3(1+er$}hCwx_$Z`7houwpC10nj<(@
zr@H11UgC7=%FGqv$6{S0g3=*tUE6<oP2hl|^S0DCGn7>q=Ho6#y1a?QIW-G)vnJnf
z#n<hip8|so8tZI1V7==F^1i_=KLafmia2K}NMn7N<&m2lV8$LcCVpjrdZtzpzqYlM
zh|rFd#4hPMA9l7~4A$ESPyo@BW=6XSCB_zQdDbR>QFV-_Stv#RXZ<*O?H=+$bhv$R
zhPm!YC~iQ1a5-&3ap|=mt$}g7Ur3725Glui+;Ao)soi@B3_3s6nMq&opK({So4V|X
z66w8N6vSl4XMGkWtE8G<2CBrsWN}%9&+z(l*;Y#07|D#GgX@mebdkQ$>toI!)PBa2
zy$wsfBpNhl=HOXB-4v<Dru8(8`NcxFXYPDvZ#x(5#qNsRwmom#IEKquNYd$aa-r;I
z2;W_JupCXkGntkoDHly~x4OD*jrWKG>)E!fPJtbG08Za-D3A5H3btIl$^P2&P_{^_
zQSy>vG>sNAeRkwezoL8MuhoGf62<#{p7ML;=o7X7myFJ{?SKy|$V>bqi`g#>Z^3|1
zDB($a^NGw$qr<)6Z3O!L$)ErB99w<1o%ivzvWck@S^joucf7U@=F`cvgJb(wpUAmD
zp)0=fnKp?-yKT!q4X65HA;f#N4QuNY#YVkLA{YSa2jOY-l<P+H;IZptfU9&VhJ1+D
z4d`)0Eu8R+4d6{HjZ19T{q9}`3!8a`7>;dSzm=5e*VaW|yd+zzr7)RCQn3bc==J$t
zA0JP=6ADv}523rNX^|8V)aTQPm<%}pZm#_b>6~ja@73mh3TAL#vY$?P1~N%+C#COD
zc!>HK?HfO~f6Jh8>s(9c{y_hByq(DFoE9G+;Ap<O@m}Z+IaMtp@iYngfWYdwxh(2_
z(J)Y!5aeUuhEDyj(y(Pzl*?70Bi%Il#Z=g<sf)B{ZKMGoTI_TzEXfZS_FZ}xVgK1p
za_ZXR<o5RVfx-pTzlKoI7pzhRrY)&I>#SGo++Q>P(<#3t@sIewUklW9p_C%YzwZdO
zoFOem`q!igvsxj~{imJiH!HODf7%h{et&2CPkSV}Trl=OU)$GuCX)-r{nxNk$^Rb>
z+sMK5jYC*l{9pnY{2y5+q<uV^GPA=VY|w?|RJ6DMOKuz)djXXH|ISe~B%R(Y*$2Qv
z<gZaIvz%`4P9BPw4@z3vg7)5;Le&F!zB-x!Mp{-DK9-<7E#zfp!%Y(o^mni&c85VW
zFLfK}zS+a6h$j>UO!4x|e#o(S$kAWK5;~qy@BHu&$;eE4cHEYJ(0_tE(5tX^bA84!
zhxZ9Oirw_AB^#-P!WtXfd><TZ**62YELZrZtNQu@sHDsXo+vVIw5-oJpD*abd0TQ&
z$QJ7YRTmWS#@5ysGbVM!T(}{ctG&_koqj7Qy14R@M8p;bY-x^Bcyg=gFN^jTWoFmf
zOf6(i=Lx-a!E^AC$gNG^^YXL(o^2p4@p}Lx*_3<Xb0Nhp-$P`8>>4jVJ@0Y@4#ii8
z;G6XJ={ed>jVmuVZsTCK#J=mKUCS<>j<4RA=h;?ogk&I|C}_OL8R%Yg(viCQ{W7w1
zcx$&=T>6SYPh}BG@JU8&sObp`h_mIzdRj7x?d-}nd-Hzv;9GqluX$k-Or~<+*U2Rq
zyM48H=kGrseK5D!3ZHo}yIuh2wfv^bj{N>YtgpEcpHB9{xpCIc7QWFvH65Or9aGG3
z`J;twWhEJK8!$8;DbBQ?>_c+lRc66o-4Vgkns-Y*LQw*;vU%+FjoJO%oN6PyAJ%(j
z7VOBusb1M##+KYU-$~PS>6e)LkDyO_Pmwh&5GXtTRV-Fv9)^e;W(5uRGMNy->QuYe
z%XRhumiYwGljXE^?wjGU3L|HcQ?<y`x%S}SquMxMw54iaOqckWul2(r5L($o@G=pk
zk@4OD%|UqP;N#a6mn;eiT-$TdY%+p0peRbxnTf9ME4~@5>|6kZF85dYN1(j|e`cml
zh}j&TGkbyKfF-baC?g{PosAY-2_#QnSU<h<I5KLLX!V*R3dh_z-OnfAz;~|!ImfKo
zFX2U(4(2sX2zHs(ybo)*gO9dmEd$19_-_kc_RCSekLV)C+<?E6SnPO=N$z>+4}IoW
zFQyJMf8Dt<?#K9zZJg0t(g0OFG47%5kD!3D2jVV!<Iy|P@allDX|u6UC$yf+roFiP
z-D{u>@yq+Y?F&<it{qQ%L?~l`u)&H=kjgYKKT8;NW-rzW$X9;l3_z5RB^ymhRz>wA
zjFj!l-zd53A*r$1oddosj6RX0*_Bq=U5-l4hLoCwoC3+qk?s=nI>zBZc7_bOiUpK+
zy&oC%c{YMjHRCrWe9QfInj_IQNiCW?f+hY$v|0NK6jg2=qFjh8_sGyCb;Iqh;2fJJ
zUQXf$xF(xB*N+I!wIIc@9}q)1C{I3~VvaaXw%iPt{1WsINKe^#dLjlEUXDD@$sSi9
z<yB{VZqO^PCYDE@<_Ijh4S@D16IPjx&+x)_<9fReFI(~Sq6-jd;PriZnpVdxXNI;y
zuz5%S8z}I;spfLScsOlc({mDue>qY3btvstoUkFSi<W_h#M9TrVxbZ)FWGG+D#3!~
zu-CKGvj_5g|Fm@V_l;^1Fh=Uto`hz>ukXu}Sn}MF-?cCbx9iMz(4Sh*cG;U9tWhHW
zAhbFWT{!mhGJE%2d$2Rr8wcHqO{1LgVYa=&)yFl5W0bCwda}i<Xu>6aX0Ct42cDJS
zM*0`M_!du<%)qlBzaR4&FoxBjKY^50=5~hGZx=W|86iM-<?QA!WLhyy5*-0uY`yt|
z<t$>D#A3;+eJBGA@Iy`X%-Y{3=2yRezJDKdHTSPUZNy_VZU@;=pxer5y!LIrCZqtG
zZ(}nibC^GAu87*bA3MvPTK@3c*)$$L;fd50$f0K!)N>1X#yN-=5Y5%f!CiL#Ig543
zgoVEwmlkrQ{4TAaj|^?mhNSv@GPiG<zxf$iN-$|Mvl+Cx%c6y8E2z_nA2K!QClY+k
z+hX?$aO^Rh`Dn<gl9+T%w3mg-?K^5a3pAD=TF`G&J1up;f2eA0%tLkYKJ-qtUz$vm
zad$mggls~#=EPDTUyK>7ZSBM9mj&Rvemud8q;g$(*B9B%x7tyb`RSzIeYmP~@=i<v
zaH8aAknaXh?>?Nl9acjTRj=R-2>2AV^<^M{mi>)|Oy`yH?M5bl6drB!5zJALHCrxo
zNnVzh`}vxGr}O{?*)wyp2T#~to=)~VEo<!~=o1brygdH;pYX=>J}9JmtSh;!g8v2G
z1*)}o7!izkRUIMmmMG(CoqBo5t-9Oj>GBW1giVX$@j_nsWi)ug3wdZKS+8%ZCyu@x
zJ0x;3L(Qg6kfQQ-#}LGoMAXLn-6umH-DvX2Q)Cg{Dzg{Ee~cR-3g@z$o!7<Ng;J$d
zjcpjcCmfCp1XaHar+;P%WZzK7gW^k-n;EDZY*yEuL+^9(Z|l9|8t2gPP;yG#0dA!*
zKZwR6)U*+bUHa{!$-Re&;&mB~E;qh*%D9oM1`nl_@$pg<Bi$1u!-m82OVs>qM%Fd!
zYoFeeqxtvgl4zr4Q80~$-P(Z~md*`tx&v2RR2ud(qHVBdEMKS$rj+S*I<N9F2VUpA
z`1hLTj+9^2Y7bNm{?gKw1gEYpQf16(r8f$-T4g)PV?(C&YUF@A-)OqN{2gUA)ylc$
zJD3xX+c9o5-W|DyMO!SoR!`BYOcwb#z2y8~%A?Ij)!1)4{wXH!-oPA=`K`ZnC$<gC
zd!)7+I02jm&e6Z9rPr$u-n@dfZ$hM~ic}BMV&=A@PUjMJi{?^L8|)ugPHK3VRTH(S
zQaW_P;P4894OXgSH7ajkh?Kz-ywvP?b4q?Om+b}6Dfa}dS23q3pS=0sFEUM*inRet
ziHlU_cewBOP{!sr*6O_YYVv2QGPtjbZwOvuwd-g^(7xU{Dt^B=)k00JMpYzheQATD
zdS!E+`doaW?vkTfm@j{z)u}@-W0xTyFZZDl?zUFwugxtK_s%Frb&G|D#x4)u)o762
zk^k#!+J1=t{B-HB>y#HSs}(U&<8c`~NXg6Z9W4Dzr=_td;@!UTw2$Q0Xp0vXM^*F9
z&A;pAwA6|EESZFQwo+B;MCd-(;D&ZAmrp*<(<w$8s4l%<T>9Cctt2*D7+;xDBt9Om
zEJ06Q6wN%BnwF!KTmV$9Oha;+_Yu&%P4p4%c)x4D@!dc3RC`v4RsO%4d&{6Wo8Vs*
zmk=OGf<th3_u%fq-QC?GxVtSLTo-qDcXtW8xI34;?|aVw+z+?z*RwUXRr_p@bpLwh
zsp;<N833$AC?4|zqm+m@I29SUUu;+pCeh|HQqhz05=S^YM8>gL1zw%y>Ky<vW%mQh
z(vzI-@_5w?^KoC%<{)Fnlx@#BKLEO#i(_?6dT(+*4z1T-nD3yu<&h2Yc34}H1yTB@
zt+~~k&C%R;U)H=76eE*q9C)@ndn#v@jy(Q_C$(&YFE%$kQNoyPK=Vptv!%RNhPptp
zv3OVrqa+GrG}%Z|%*9H<Q_iI11F8MSQRNF_+tSSC<hnnswTbUddlsgWS&I8KNnTr_
zT$YvQe9Cz0myy_)`vRaPta=;CDr_FpJg)ePL`8Rt^UaU5jX=D*Khs`X?d&#@8ctnL
zw@)K#y=>izXX+Ni_)^V|cWosw$oNHMV)#GP7G8p*QVaMv%wK)9UrP#W@Z6LT@f&4H
z)0JPHZO1ngsLMw?9E96>Z6X1+#OKFfZ%4UUU0e!I3mzlG0lWgd%{w{9yxB3TQUkns
z8{7sh9=rghxfaUqk&jc_^U3H5iX-`NwNAgVUuNFIU34thRm$>?R|h1I&o%2C8#6eh
zxP9p9gubT999yk%vAPQvw9uI<a_eH}{o<D{NxkWHV09Z4J0`C!HC&Z<)#a<%!P<Kl
zFR)Ilv5^{RD`m^7JM(RzvG~)FpL<&Vb}GG}EJD?E)i5*iFkBw~YH{{41;T3Qmh2at
zbD8D_mUriL?-T387%e8~EZ;25Ti*i6i>p{+=a;F;lCUsJeo@<^Q7zsouewO3U;BDA
zuUttbQ?)pVeyE7jv@2+3Q$>Hc=zVr{exim=<-Efq<5L9HDxXP7J5W}h`0bl57SI$Y
zu69>>kDpYM6hnRCIW7=g%BQUUx@=^%t3%s()$tGUR~|CO*CMNz7niSh2dbmbX{9XZ
znK<`l#7ATWKeL|9`)T%c7w*T!=2r6tN=W&CdhtH?nkK$)?o@QvTM@rMS@(ii$>|iK
z-yhxU&qDV&hMRU}zP=0*)lUO_lsAhF->CtWwMQ5~6{<;nigD%C_zLzI#j4en!^Fl&
z3-+odCBhWP57Mx~W(ATXec}nb4Y7EhjQN_kWQM+vE_cZG!pz<tvY)g=#ZGoZN;r96
zfUl^g2N|AkM|NzYJ-7XgpIq5q;3C+i9F6KUC!fqa-N3Ph{bAcRxf-3XJCR4rf4v$V
zl7%fx-foQLX?VBI<?t!5wE^Pf5S1mBZg?{x7S_D=eO#cIVjuZZ7=h#Ng<0TCqHq36
zFUNZo787r3!olTWqSDE6Xy-=V&oP5-e|FD*BbnI-?`doCjzi36UZu5VYpg4zU1HU-
z%R$#>*uI1Juqz<ovz-%cdZacMe{c4kM@|&`r1TkbYvUKy*@8bUP4~u5Iqav_HJskZ
zxGQWG;@r2o4%LaAmA83gz9TrQI8!bc-9S%j@wZ@^;mouPtuNXdkVr5uUcA$Q_@x_V
zMj_L{t#z2G2I4~nTe>&9(7Kx2nSpfu_6k#Eyf*mbgPypiKuZsy*`sxCaXW(U7d^;m
zKf}fECj7^OYss&BqmW&NUu`NPI5T6r!lbs`Qjj<H8jY8BSLhQT!Lv9lG`COBWu6gA
z^3`qYi_5&hr&dxfbG2zrtFYyLq?$di#XAjLPHXs@-z<6T-M!t4i`Jt`R!7`RiL%>q
z?z>|z$Icd!XkD7_P9j{>gDXZ?R-I%{F7Utg1-No5IW2@%_kpV>#QmvFzD2KRjqpz>
zVw#)!WR-C=Gnsz3=rkIDx6w<`-^%)fy6yZjpMp4P47+2w>Q2!;U0%h8Koq$uakbi$
zoom^R$u1wclFiRkwWW@8@ThhHHVbqpcWWWP?rO4RxL^!w-S#HWFUacj&`GHsvOfK#
zx$JhuZpF14Vp{m9w&1wh5eQXBLRWbra3Kkr9_VU!<uu%MWDtPimRjM>q~!7(c*+pP
zb6$nJy{%YT)#USRJjpW~a?8534p|t8zPo?cSp7jf-(YPv)KGqqoo45|M@@Lwbce6s
z4uGy)Z)LO4oY`_&@M~*sun{>ccq!ZXwQvMZfSd4fL4!XfWmw&r093sU(KdRl>#smv
z?%l=N%I>nsjduQ8@`ouM+Yr#J_fa0uGF?6s&s)$Gb#5K2Vj}_el`G|bec;U2lZVSc
zcfs1@wez-*KF8|$a4ewrxvSePUh;TTbg9uMXt%&<2PwNZ-#WzgyzKl_u|mL;OTBf>
zz1=Td)Q2arQR_vF&S(6*d$5<%Mn@dE;OM)-){3w?CCWdXO?tL3^hLm3f%g^AyhBD`
z>}BMA11`BUHe1rAE1u~rl41Wj^?elAedmtLVw(;B?48Lr8Hkyc^f6=O-Zd+`HMH$s
zhy2gP8_y2jc;S*G6g2j7`dk{UX<e(uw`8d~ZieG`@pF7vc_T;>(qza;S?`vvW5_=3
zs{y%m>631r7QU708KCC&ANf1iZz*)>j$9`2G3Qk2Dv=VFU6WAe2elMjEoqbq4JtI;
zlUaO-Yj$SV;T0dZ?%bR$M^|k6$gP0+uKBD_Hkxwpx-QOhql#^AzjV%9E~W7ROr<x`
ziq1!`TD$7#BcwxI>RgL&_K>pb=s73)M-fNmnn#C)J3F&7vFL6Nm$6_^a{J;J>#o)9
ziVl=v(+d#}hOJH|irD7&lU$=Cpi{)%yxHuHJ1Hp(i=#{{fSAs9Hpb2GsQ??i63zWb
z8oT>))We4JEr?b+`|3c+$2$@udybXjC>j8ERKo$B$t0$>PpiOJ4cJlH5e=YoJw2GG
zwN%DP^6b{t>hyuWZymyYEJFpg%_bhnq$>|^onDL6tFtDloU)Pq5o+E2x!Zhn(}uHp
z?;6#LlT5fog!bf&FCJ_FJ1-X`x;~S>x*QqC)7m4>norJr@}jmgBDLY3wV=#(V{gYX
z)Tl&!jq$>C_OX)fx^5CZ_euSn#s<}Ohh5fK8%eS(zmCFvAy3__^bbVXYK`Iyb5D>5
zYZ4fn^PDyD++t(ph@!h^ZJl8+q-x{tK$-FCk5_MX&{XM}b#hdL!fJeJP(@xQs}Z-M
zVYAEymFsN|vZXkOa-?C8{b%p2Lb-IjHt!4E!chs5W>4nU!c&8QrQCV8@`}a24#iFD
zlQ$KRd^N54O~*<uX>_5~pKmOBINRI_LkA62HoXW?otts~xk{<LW3?MDOI01(-mAOA
zLcObu#*>5&ux{7cP0Vh{_suQi>b4>;3>}yOuBt67Q(M-uy+11#YrMtx3>i&VT+RCX
z5!C(m8hT2-^H833?N`lpJ}mES6^H0w`l^cENpQzMG+5=c*HX$nj?Y%zv#$ixjj_vZ
z<3l3m<3*C0Lzcz_fN<gmpC%f+qDx0t^#K^n@nY>+lDn5$PXEOgt`3&UX<EDkU7h(y
z45|QvI+}y?h0_xf#z`x(={QaRaHE)oDz+*Ql+)29`jWnw^~%oKv~|Dc7`Ue1t6IMi
zi4PWWM{@mc6>sNe=f8&)$wt?-mq|>EOu=oL5GM(ULa3|&M>G3SyerYR0Ync*`?5`=
z1RpYAUGrtNtmtTRuIcz=mk<P!qIj6SNgpt?)uyfF6v-9ioU7pBZfI3+oE93j?vpb_
zb&UQDS<VpeRc<@TaO|rDNt#D)Iqa{n>*l7lIWowj%i|*}PR2LocA|3JgN^5J6)!>P
z&ZR}(+69bzot9q&tabGkV??=L?Vvk{7hqd#)z%kM1y<_(>*gGP%==6gJ1>7Rb-lQ1
zE&2Sam$oPH5P!}n+OJu=9V~xD^uJ;OewDEy>@FKHu)LIC+_r2Rw@I$Qmhi1S9d|IA
z>fqs4(GL-0*wLmp1|VOnPIRA_vRBMsu51A2lp7J}mpYV#5Ue$XRvyb%?PX4^b0qte
zy?GbvtlgT=I@S(d8FW<L%gno6$nG8ZG(uEV39KyJFRv$vsVL!#j4(Vx8h%TGY_uY|
zgnN?Sca|L8qR1J1Y;BxRKkj&MWIZW!^p)8&0J$_B<J3e|<tcSex6RnHG>frc_FPLn
zOkIH2f|{@FD!f}?V+GvaEeSlMM7*BM*abRNv*AY8+v8JqxzU=L0_ilqn<J)ojFdk7
ziV7oAVl*bF54Yg_R($zqKv~OTDa0rtIWERhHD$$DW%R94qWz2H3mIVX)HEsSkE6%E
zii?TcOkP>Ru7xM^PIsF(O%$g<5v;eRn}}KR-Z<U@Gn)CPwf<sBc&!qKVPjQeuOW@E
zEfx-+N2aP^8|YBD0+Yi548u?bB{R`F&51LKByVrVwurFzlCM6kKo@J67F>Ecg9G4!
zXetyQ_PmI6BG%D2k~F$h#gt+Cq>U$>bi~xgAALDp9^FW+=XXm8I_t%<7ANzV2Oq6K
zOi6gS*nMUAqws?3*MMhkn;^L6dccrYBk^={ZuL}O*qlDpo?**d2HRYORCTl(4w)ef
z-g}f#o%uDJG(i)h5JAGLeKNG<+3wdF&&1Px=e85Ryj^rtW{7z0(fFah%SV({Rue+9
z$IvTl8d}nuoal4`iOWEC?@vYP6<5%sT{f=}!I^clOpXi=E`33~R~k0^m5sI55G%F>
zb#=ZgHd6*4*1Ji+3UTufn=2I+tKpvW={IH9F?R{8a6Y}^vIfM{)rP?M;6QojMqrs4
zh~78I<+FDaGdv?`Iqjf#s>W2{uTJ-+`N!N9m>qcVy)fcX>>FHxcgT*|KD3m}SYl{d
zPdM)UtiH9nB9-FL#gtwqv3d#)ta<PMeL8+fh;;JmhPatU+i>%Gj-;k@pySQQKINQJ
zt>b)O<T0wEyUs2OE5~J=ukj?g+_sKfZ7>f$^9d|mf*YU}w}I(AZ|%&M)Bm$qIYacW
zf=c7^{iQXo)sc95_)%|!%pCNsqhgPB>y*2W_}Y#~A|+z@^%hJj4@o)WM~U_7RB^}J
zmqn!w3+&j&*>~=kTt^w;d~Kd<c4bENYLD+lp%_k18T*E5r|IlA_06Hus3s4vR(GM%
z8hGY$*LeHRmejN^p~WIld5M#3Bi>`Ur1!Km*BK=qvEEnMo4gAtcJI=(;81W=dZlBz
zX~XN6vthd2V1ZoJ?rq6I2;S`en0%)NQ7AQRwc-#uR>QNrCP3~g&Se)HQD`u;lc(Me
zR1~YWAE;o>&#bG=_5c-rg%rCBTAwbL8-WeyZ1=<i$vGR|TxhJwRy=PS$a>v<oAk{f
zdH?>vw$Szqd-k5@)5m^6K#qyT?VDrcA4ZxN(t(s)#Pmui#9x7bka(7W!ORY9e1X!}
zWY+U6Pcd2Jmhf~uhngtgo9@mcwzni9AO~lBm(-qMW&T{6s$M$N-oJcgoYa5(3<*EF
z&qQKJBUC6>LwnR#P}_(%Mr|9}V?U`G`5AvSwwk`mAf?)1cUsTI?b*{%i+4P}NZ$|(
zl_!%Dg_}cR*1tP5z`gI41O5q|=cfzn;6OrbKLQ-n>*`eu^dN9sc(J5BLmzPk_le0~
zP*t`ZFtJq~5nAJ(X#Q%^GGFu+GW%1(=yoIA=Zv95=l6b3`Sb8q{59C?k$C=Hmo05k
z7+#5sH8Ug~CIzG2;Rn)fsHBtcRwrt~k-CC6mr|+HE^cx<W7O@F*V|)ZpW(gY2X!|E
z1x5SIQcpEo^|&tRVqjx-kCZ8IQtM-vKeh~^9<YqLt#Ji}a;Oh4(WR5QVA}ZJk^b&D
z?mUMQg;oq2pg)WmF75q8UDc6Zr1*{S^wk_W9-8()(%WN160%4b+^{DJUWl(vKNr8#
zDDa+wEN)XenHGHlF4@T9Tz!4pMk~?Rs#+@23_H_=Cl1FZ&nJKTZV4B*5y-WaG@*cM
zz9D(Td%qyPU5{`C^E@Kyw!WvC2B)d9JL3usbPm&{cYJOyD>q)==H0cWxohtWnT*bR
zKYXNFxzxRqW*sZ4Zd6*>^B7eFdORt48<zpkaPp$qc{K<Q^p+bYK1BZ^ps!Y@`%Bd~
z$bf-0L|9I5T2P~<wzFoG5c`u8|0Maj3TQIwOjEfTW_&xT%b?-SnCMlkv#Z6)7XMi1
zTIPqgXz-W(b-ATOd!F`_&YQH0;=ONpQZSawY%G=Ziy*F1O=WL)Zm}5!?z2<%>lZxF
z`<#zUO6gqg8?^ivJ!hXMcKfgpALd<x9zC9oKm0dkT9a=L8=Ea#vF!nza(oh7wdu`E
zwn(--#>F?QD~%|3NU?Qi&=3LHBz8}t;rN-|fq(_|+bbQ%&4%7{eF8ds*@DTO4c3kQ
z1s`!Z!V2UsY)=xuvNr`cAm45%-VL~eW!)0*LN*B*KGHK0GzFOFZY0@QTMZKD22^i8
z@&p7F)!yj6`ZwgAOD9iWkZf0&9ujcGRbgVby9wX{3xO3eY2w`O8Tm>jsB>!rETjdJ
zNik|6drB<EyQPYytnGy?plst=CT3>ybDcM{Biij94Ek%go-NC!pJ{SyAo{A3L4)L!
z_5~*TBpG1ak2=DPR4n!*{>!v@bw+Pq*==A7tZiW=Bu&~Q%Gfh3@ImEWfKQhm&|ioT
ztTnl*KZCMG*=+BRjXD4BP;q-(-^i_mDm4h~-roDl{!SniZbEK7JKC1P9mI-e6Zbw#
z>_hO^@1j{{w|7X@h=@@6c8?&_U1FA{kvh)&p6aiw^svaj(-4~+7nYumlQU(;vUeF6
z<P6x8V*!kGmID^rD(B|NcfqI)CYW!kI~MKEjq!_o$4foZ<BoWW=$EDoR%O<PF*3Gq
zpo?F=m`(>V=TBpuV`r5$e|S9<hBpF{%y+<L(r~<_6lRm;W=y9~P8r&c-%(L^V9fcg
zID#K4qI`*eoArc;tzybO4+cy9n8iiUuQ}Sa-P+zpoeDK6dF-Ib2{-8Q_QsCzaq6OX
z0!P5P3?Ke<>h`Dki!4y1X9&}7YSljq!CMfcmSc<TZMJ9uGL?69XeM;|B%f4+cB%1f
zVCp!<R_a5(J}9e_EsPf35cqA$i%H>{3HE48CN?DF45^)MUY0zm)BwVe4r$NIq_q|M
zbo2MEpC<RozE(?f+AmbU22~5Trs)Iw3F~=)M8asUtLr`=4&1-~<|fvmQ`3JP&v5?#
z<3^YLzsimN|L63tScYO%{q%`5J5A)NIU?WKXyHkM-!<+!O<jwUqU|jR(8#<j6yRgT
zvb@OOsHtQ@5^#gfH5p|)wHS5&klP57b?dQBEoxIU@o1jnBmqQj+A%f)dZKa<S?L08
zT9-K=5lLveiz;9PHuzNrU3-pfiU!Z|4BKnD?|i=FTYJO3zrpM){EbU!EH^=!`{<M8
z*#Y0g5ni%{tFtVz$BK^d*+B;qGTo5oaz5`8iG142@M_g6wUN2G0>85n#6M(JH&zdI
z{9%3!ZRZ+afDULx)^TmlYrT~E#vV}1t+TOy>EK!&scX9GaLF>YzKRe~>)dmHeN=uv
z@iG9z*=dY}@d05UqTz>!-j4Xs9){NSxyG^WnEs`;yIp3QtjC6oT4g21HQ#1$TY8n*
zB~G&?szP-F)d0v$^9g2bg4d1I^B461!tCsDOL77#ds)=qNVwk$fc_*veEiJjt*a^{
zP8fhY^vwG9ko$?z(QsKtVVR^7fqbr$n=abp2A?m|gLlKBY~_;l;fVF24WP!C-h-EL
zS#_;<btae-YnVMjKD7$jp7(LhvI**|Bcf_DYR^&S$MAiHnVEB#JyH~eO@&-~G)!RB
zl0n;e;tv=DPuD+_-@1)nKGPj)b1|FQ)2Op1Jw87=opYp}1_kxeA4U#L1f;c&qB<K5
z(i7hBS#F5Y5qHe^>z}$?9zJzEHFWl5=cN(+2?&u=Pcxu3$Ev3B9Eq8r^U&hbC`_4;
zzQp6dq03XNvElP?=a~YP?cF+kz7|vaNQOxOaUkkgUWH;j#4S2jmVou=)L#cU*iRk%
zxJtDmdLz#J6w;&qY(c??cbk0sYG5UzjgC+3(NM3A1;gm5c?zO5v?aL~dz$({w>=Nm
zM@}SOtz!a#v%!WepiOIP)qKjVoS;|QV0^&~iq`OXHScgkMeZI0tpxbWn*S0O(ULd)
zTz!)0?A>`9vM~`byjp&}%$+ddf-k+Hpmw`Z!?pS?wj=W*ZvE13Zg-kGv3?M!Ly%+}
zz|wIVGCE{dGd`%cUna225uV!90>UUJCU7oK2Rs?>yO4PvUZ&v;#}~a^eAIMs{l3kO
zy8L!b;l)E^&E}-tU*Wp!nZi(-Qn2pkhG?)45Vq+;to=|d;o5ncUw|hY&erM9vP!!_
zMWR|rI8F56Y|gA9j1*K`7>Yk?UPKQeD38M$L@%{r>gbYvu0I;Th~7@yTY+3?LYRX3
ztRqN!9nlHjxR^Rpg}0O}hp{oc_fzX5>(<Be$%!pGpeMW{hn7+Dk;C*HrJ;yx7Ezyu
zjM!GANv^KPX>j62{`3tCXO)u6{TL|Hk4$Y({tRSLsZ<m9TMn%*G~U1wof9L(wCc80
zLAQr(U_S4tc_FEB5y8bL+!luN)O#Vy2$idVy)FWctg>>oCo2dEhDl$2zp1Y;*Z$(C
zN)9=(FPsboi;92VogdRdWD$ovsDnUyGAk*!V>DQGlP4(J@l9k7mtdwbX|FyuRU|<z
z+3fqvwOWS<H9D~2<G2mB0!(dO(*uo%OyqZqUPaL(19diMe8wb*wx0D-HJ6!?3&ixW
z3;OSOnUBJ6RwYGC(0OpLTND#=G{&dDQ9d+T1KA28deDB%8vR;DxY{Kec|h&}P36vO
z<bFF6fFdGrVS+?@s;|#)>a;GX*6fAKw_sTx<k_26Hy_|mU~O{x6WFq<3pu)JsRst*
zNMz4%47uEgFAk;6W)Dl_|5AOvE57)MvZZ;-WvXo*A!xuK`e&(KKad{N|LwqRzw?{Y
z0eI(NLY(mUF?0irAm576+8G>#lk=P{?_+VJt;%`KM6wsTPgSIl>!i^s`Sr!{#zcG$
zSWlEspl?mG#Mxk|&KG@ioP9T<!eBHIQ&{o3E(&KJ#>B)#<H5Y%-=!p1QZk9^IZ=b3
zlyo7+xurS(XPVT{$D*b3pY~yUhD<3`3F_*6{mJ?Y_9awQD#un&(6^o(=|0;4{#))+
zqNii@d%B+?;bo*>eGAB%^PGD5Rn!+GbY<*S4@HY(s3%)Eqzd}Z71A+c<j)$ozE<&w
z{B<&3ffdX?mKKmFDfWBLwY55pFj&S=$x~YF{m7iT4U-oeO?KDIVmkCF3C*`~e=yIz
zs_B2?ju4`Er^JPr8)066hfY@xO_zl(mjsf_9a-#s$?TyO{zybLm6k;HBZO%ztkq-w
zrB2>``8e@hlX5RXG5MJ)vqE5;TQAejq2#LK<{4&BhO&Upt6cO@{%rPMt9`BJ79Y2u
zCQ&g&=eVGXwb;EHTDh~4TDp0at1NbM^ix75AAY6Mgq)QM^q%{zB7$!X84h(yeV1a2
zO^3xsFh`P(`2r%HP}&;0Glfg7##5kxLPs0T*RSPDO)`qQ>Y%3Y9{Drd<$?`V1rF?G
zUrZBih9s5Q7Jp3&QO#KX%&Cd)F9H5OC;<xzzfE{kC~zrElvGE_aY7=ra1<!;`bj6p
zdkXB{n_Mp!T%)m8`s(!|?0ZE*G_Rye+dwXji~>F%;UXtU)l&2Vs+IV9+qt=hk)n2&
zIVK%1aoRSF(U;PSuw!RUHa_|e1MPX0PGzm67>_3t#Mf#MPh~1?Zyw<yJ7gu#@@tNp
zUn#$>@cZp8^w|?D4@hq9t-?!bE&r}Hj&^LXzo@7h`jC@$;YW+=c=jpS%BU_woAc{<
zR46GGO4YSWnva~mx;}z%%YD<qYr=fV(Fu@|UrcBX9pm>zD}M=&x*PWz5?|KJ7(F8q
z2f?o{XB~Hk>hU}6e~vW$MESxA!H<lgwbHV6B~Bvh$fnNPVImXK+AZ%x#{TNWg+9R8
znR45RCloduTz*pCa&06_&fETlr`6D|sq*WuN*ZRQ2{rszI<p@el`>D}#7iFA^JT})
zCnrCNm-np<3bAQMEkONa?Lp`!rt>ciT$vLsNC;qf#2YIUBZD`6V;8-87lb$7qt+fD
zK<W@#7Cmi$oH#!ODd3KPQRG)$83#$q)gL)=<xI`EFwG8AK*iYTG_mX98J)n9PmMzM
zUbqXNafl|<%A2VloNcr-*hQY|lEi0bijPGL*{QmIPRcHJWYD9~l>Y}xSGez8s$w2q
zJYkpBeLKCy<Sz^_xMErsxR?C?s{H()5h--P8b^()gy<C2-qnjv<}W-|C<Q#3%)hD4
z)=VqZpWP;X+?yozFhJ46vy=KRZ_3jqR&8{DdXvTfcUoTp`XeB0)HlT#%mqH<1M!H?
z=yeW#bbd`@e4U)y;tc=t{y4UVC$$!&7MVp1f4k^k!HtjSoP$7Yp^2$uyxBOvJL5;J
zg)E0$<xJ+KiB7n;xtQnb7;=8Fm_@E>%CmC3wWKhEkgF;LiR0LNeg|Z>T^#D2g&Y%&
ze$|3|r9vWg&cf$d<=Va0z@-jRm0sj_+d`UqCW&#R?@>7=w6Z6fMi7M0#1VCU7m75~
z!-^ybWOURf)c1^{IV~dI4G`$VHQTxtnxNxY)qrUDTm{2cUw+VY8YrhFm~lRc`|O2!
z*2i>xR<nxj&!->w2qg7#2g8r(>Sa$%dQTMtu6X$Jo=!Hk=Q<xscm3d3a|#`AogF@A
zh3D5r_BdKVZLP)_urhsl@tkkR$DVFDHY$T<Duww`KX=}f<@_hFqr?4Sz)02A7EY=$
z<cmF3@CCULd^V;((4G{$rvKZ=WUwGnweXRG4nl`GBHs;NZ%Dp?w&YaVE3e`9kAcqN
z{vz>!G04Ef^hs9}3^I3C%z&dK`J$h(kAT^M#)o^_b9S0_E2nU=k3p29ul9<5bm?uD
zF;e2P&nhzShpKMY=VWRTEXFY}s*HLNQ5wHJmVG^*ss}d1;U|eT?BNyqhJ+=*gkk6X
z3YefGgRnQ2mY3Ssi}`~YPnpleU5}V?Xrd?`WPn50eP;J*mrt*~smn#NOO5+z&D<!v
z)xrk+QQNW2;Bw@tQRUuLu~UnA%UOvc{<Mcyw5mDS8o@n1f}e&59%fw{-VNdPsdup7
zsv581L=jz!?8G>75s1{E1vqje8WvwX9SNa><lVM0tGNwAGjJyM=RWQw5vw`zgU!>4
z*eN?re|#vA_i}RH%RjqYOyy$veA7RQD|S;fcyV>#yWSlGw)yeC3nx>+_~B<utKJmc
zpo|eC3zH#X`b-|z=Y<ke9sN|nO|ZDgjzdBmoJRbYso*ANHQA4g%Md+4=^wi!`_}lJ
z{l-81F$J|R6l)v_Q_$1seBTC!EXZru3E)Zngm+?aFc}MX;3i+I>%HprQSrm1;BlD4
zC4l{QEcMB&g{7_3k;Qlf%#s9BeSHd%fcC^w^4Z~uGJd(>BZAsTdhkOe8pHhvOU5TT
zqB>1l0Y6U)l!y;F>yy*aOi{5m{nqA_8nKPebp*}C%^pqP4p_C0ny4ekS5;6g7*IJ~
zQT5AqhH3%?wdue4Tz*CQI0^7(K;bVpPR_I;?qVV&h`<}HgKs1}X*d`9WB>g2yVJZz
zPYY|?XPw!U8oSWwC6R>G$GAAZ<%A213x6OpVcG9cxVx=G*z2@TDwN(iiJ!iO_+TL@
zf%MtJz}Yxy0Jd!TwX}&!3sZkPc;p2ODyt`Y00aypMHg~*EAQdU&(rs|x{1GFw6TNn
z-+zuQp?i4r1fPI<xF>o}L^-?e1vZrVFU~X^=iu<ggp#)nt#D*dElY*H>f+sM%JEf}
z*|G&z`0K2HTJCT|^(Q7nF~lF9ao%1)!~d8RfyH5k`E@&A^{_!zUg3pSTFR33g^P(M
zueDYhCMIO3>+3$PYnlbwt!^^5OtWSzZgq~Yhz=xu6}w9yzK8xuB08U3(qgJHy;CI}
z=?%tT;~2j&DS`8-A5Hhk#Q%TV5@;&tmX@?M87psH1zAXsOlO8CcqFw1VV2#OF_C>m
ztP}~)gdi>w{QikwAOBIyQR-^uZ0(w^3px)plZ5ytg0gLCG)xSmwc@eGu1^aEBz5V{
zhiO-r`MYo2QCXz1;fHKelo!|W0VgFbaNY<noHnbfHuo2z=B2%L0BHOr6AVq?6J7UN
zu)^oHE?Ja_{rBWTpLMbN+W+3qd;eb$H-9=l(G^vgNJQW1*F%J6Qj1k!;}ZRky`e9v
zaN4h;f|kpZ7OCRKj0FG!Y@T|2cF$Yf9iJ)x1?cG^s&TxUUbeJ%Fbh+$g!L266QCp&
z!WxkeW=hQcY)sK>!UV;T7EVdQVt*$1Z9=wU>gZ$-vnH;0W=3&rAGH!4UHjgnEe#+e
zZ)hIr&e7;1Yzh<sSKgH#O%*lFEG=Q9JBt55iVT%@*}Q=*u%K#4_-;=tBX=71PvbLV
zzUVAOMFX@(#kzCFV0wi*6ssiV$GFT-|M;~pRPuNLedgtr)4%_}o8n*Bu@ti;hy6=v
z3aoJYq07!0Tn_N509~p~(deMZrs(=j=tNBDc-Gebi=54`8pH0kmtQdNqS1sv7utUa
zE9cO^rP)~(G>WEu8Z~D7AJ&DiHe`h|+wqzop*gtf@(KT7*6im_=t$YTCBXm(H3e#$
z6b;$Zt81wvCRyq~$2m!){O@vOl3`<N8nSFvnESVu6F2v&XvKe3!YWxQ&?!Flq_gGD
zd6du5qBEr0|5MT}xr}GAhpx0gSfOlZ{BML{In@g>g)2?@QZ4^eHrbA%To97ClO=75
zJ}d{=e#f?Dv41i955DbFPOHx!&A2bkv1C)Tf|-@a6^ITPYyD5L9lf-)uf6sHMX|Gv
z?@J<n`dN>u^!IPu%uD(mgypKD(A^SJu#dy!giDLj<d9Ar^>b@CFY7OtiY~zRu~k~l
zY7l&@=68M>=eERGkOLh36bf@Meq(wT0al7V)_eOPj?DTC&*s!6FL25-2=+ZPP0}?T
zl&L#AJ@#QL{N16a^p|tr+M5iB{*Me@Ixo*jALrn{pBli&>3(DKai`-{JzjECN%uXj
z#Pt|&8k6EL!-?mYu(dhDZrL=QWcF!j0yq2c^)RKIA}`Dg*fhWV70?r+R3{2{VhB&P
zjgIV6>UXa*oY2WC^R}wD?-<xG&zc<V@aAl)7C-k?S<I@S0LDU8h<_n?b(Qh8_x4HB
zD{T4TI#M@lbU!E>G1V8o6;0o$^BeBNu<$mo#)<9a6+2J6fiR^r!andQzjecK5)Hfj
zIY`mrN+6eN<H0fFK=&_KK<4(i%5%&a;higW>5c7?{Gjv8tJl+New;~ue(s@R8iF+~
zpFF08B%92fMCVr5ifdbgk7)$?voz<J)dWxSFC*uGx7{!UgLh!+6RsTX_U2~#gQKqX
z`Zd4h&M1+lj(0J%Y06i}9Pv@TOjh2`)cJQBl6~Ls(G;m+T4=NGhez3$JqJGTUF*=M
z*|8A*#F=eE*bSViUvt@&RYsbOVeV5;va|@<3)7u42sw#!*`L&F=7kOT>h9)h?0!YR
zO#n(CN0=^;?udTJB-%BAt!xw+U*;$COi2pJqRe8XyX#pYpx&oRaO#Ga<nf-%&hHk~
zzwj-pb(LkOJ&?&vFU&^6TKOd#d&a}i!6J&72erMrqv&R-q@HLpaL7-VL7r66Q|!;m
zK$pwW?EMlANQsGN7dp9|n^WjOAS2?VG`fctiPJ`S;-8CGC8+D>t4%&i0v+fff|d#H
zeBFZ@JE^w?OM?0D<}QyhX*27wITFP-wXw;ZX!b&C!x5Il8+e^)@bmg-MIjU8uY?F~
z&lxU-zSUXVjy@>Ry6?n%I6`0cKGA!d5LJVnXj{R<FECelA|Uw!dTT{!m+#Tb-H5Yq
zO}`57G3Y5F2xi$Etz?!4KOX<BnU3+7Y?aoT$EzdsRuq$SA^H_G#`97GDOXpW*m!tA
z&2y6$e6M3JAva#J{7#}hW4n1{X;<O?z#6>B&&p_XifZC!DUj#i3d2DBB0z&8!uY;k
zoi%i*?d5w+Btpy0=rn<<<C)K!Bkr}!sIw?bdN@Mo^15j*Ys}4b^H|WvGc?cizNY(d
zaCQH9kcF0?+_b~&MFr&LM~_J<0t3V3kq{R62_?5`py*n$pQPOSNUF5$bS}j6J|d0p
zv^b|#Wl5zwJ7Z<TucWpT-E$RUtfh?eclo;6WPaVeB(vBpUaZ<lQ>n8xT>kuOD5t=C
zt&00P{F(RUaNlB3?;f=QkFvgy+Qru;X#DO@xP4#vW%K67wchwm5Z2^<I-Xw10cHA7
z4P(2n^;I+W+E+F8L4@*^oD;QkEzGqg>B}WDTAE~>W4>P)Bk6;j!=Gnk*|uF}!9JLT
zJ(XW0kJQQ+u%3Jm>1eQ<HvL0_XehcEDZrTD)FvI$)r4Nr#F`k5ef)W$U-^^OX6V1x
z=qaP5-5)-Bq@m#mS;@QCCwBhKEg$9lY0arcWz_8%l>eeI+0d97nhhkbGDN5&HCd<_
zIp`gAX{%id3qYYwJ6`|Q0B9L0TI#I#d&Rr4@@!?joCLW)AY<22+^HA&nrGraz#^_-
zA9q^u`l9~ji52&aC_6=V>Ec+Bd`H)^f2d8c$2EO))O3V`u&5}j9szgxDL!jL3_-09
z2vva283JVR&P&V6jmGs_3{!rq;$!;AMxG-Xu6~~eZ=FMYY0F)GN=b{nmRoHE1r}tX
zKYDVYc^Mc1^Da-`;XNK_?kYI3P0eGNc}@uOS`^VCe1h2P<Zl3m{_IrodIb<Gy?@x?
z4^SIO6B8R7`+7|>JXWTa96@*dAAqj1wpUk@BV+9Ba<15e#W*c4JUx$!R6+#-$=LXk
zGksqnFKx?>ELGIGaMv$sBXxQ<`8S7>hoh~$$pN}!pOhT{zI^O4^=Zh}R#G$)+#4o4
zey#Sk{-dzTjE|Y`j9cRrZ`bQ&$+x`2)DfM31Ph$)+Cfbv=4=9pT6jt$O_8Gy<9NW+
z24=DL;9EyLv-6c-by5ZLy{<rjt~q=wa&PRjmH#)MKeNAj;rstU$P13BU$hKPT;Ymn
zYCg3l+Hq(bHuEC>upOM!Z&0&wpf<j5>~BJxyY3^<0K;xUp4)LQOeOe4V{T1d;GR^Z
zJepmWO#ZYyog38i(5YBg!I~jMOV~)WUI>khJhQU^!E)Dda{^X!K)c8L91up3Q5Oii
zvTah-J**`49$lZfsVo)h%t<+)obVJ728s0by4u!K%vT73g;K=pi618metvr|M<B1U
zz!zEbq@>kLRu<b~0`SOKfu3P?WLtGYrlDE%u9wNzCGUK%kfjUqaDkwpBjeJ!yb#CZ
z&1BIt?Ja(5FnY7vFTVw8kZVG;ThVkbt}>N22KP#sTu~U``!KH+86IoXPUYm*0<%i+
zq#kbkuw)PrZIEK)wZ^J(?XK!}<th<5Yt7Pt16;c0*O?}nwpiAxo`vg5Dgz&Kv>aU~
zkjV!_Pk-FU{CsAu<mf&87Xul*&Toq?V;Lt}cJ>AJGS-`mGoVRg_bN-Aa39|85>(X{
zfUMokV)!}NBi~q4r3To_+mQ|r^Q;6VgwJEV@7qk$5rUXQ4D;4f80@oiPZp1hcS}SA
zp(EDfJouEuWNO^a6KB@Bqpj5=Al}6BIjzl5E10S6Hh4G*BB`lBR@#j{yDp-@i>c_V
zBqdo#x}34|-7Y$sZYDk9b%m9RW)TK>E1f00OU<@F9X1;QQ`~W0KWlH_*){FOeO*?R
z;-`vsn89x&9&DwFW251TOJ4Yo9R`mg4YY^0RC_(f{l13R@TJN7wu%(m1zvk;>hD0o
zj**$E+6csG3JL{usvtvk^+e+|Fbz$q7RB(faz#690Z$o`mQt7Ca>7xGahf3R(s5Xd
z5h_w8VE5KfXRbMAz6T}wviHr6Wtd@wHPe@)<@L1o_En)8S<+Z5v|*UhFR`dJu`&_^
zemM|{C2)37qNJaLBslk&V5pluBYma7Wf+Q$g&_XS_zi9cP8{ybpd>^b^*1(uxT3^-
zhTl0is7ajdf6koPnOnItkw3oZd25+HJ#9TF&wFoO_}6*YYm1cm@6WUPUx5I(o9EsK
z8(sWGt&ITZ2oWoHy^QT=o*g~Axo&zNmPkBj-z{;qb}pK8Z*Zese2J!Dj69?_xt3im
zC%}Fy0Yt?9N2sj*cVloI8;WkX<sMpB0E6s;{q>$mc?P7L>$Elb`p_q%*N%o`c361$
zUa+U*b6CF)W9V|9iN4KF)jrjVPcE3yJU7$(p5LB5M-wS$DJ5zdvRI8CoA|bdzWXR8
zrKTv&*H+&X;P>NcZh$3c_-|@k^bV=V&hH=SyuMSyg8`7bg3BdSVc#*HN62@2uK~{M
zKCqYF0yyvWw>V8Vw&~ZHOg;z)skf6aTc(!W*Xia?Q6(kJ^S*d`S?>Gk<s6(bi<>U*
zPp_`P<%cpH&&%^pm#B$)WZVdzTbh#2Mjc>}+H3?n-MA3M7rTo!NaBYgZ6qJx`rrc7
zJ;sZsDE?(5eK>5D3hHgZ?(P#BCI^yQ&F{b>x-s|HdHaiUImj<FAn})zWd$DVn=M<m
zZ&S!y>nYT3JE!6zdtM{25N?~@-~EEZu5EQpR|g<Q!0y}@QNZ>mou8MH^}>J^jH<nU
z%DrzWU5M#(&j=H|T>OZp6_omW*kP`^c>VRrG5XWmq~&yGYDY=u?#}3otRwT+kYLkg
z`Z@XfM^8*_ba$Jax4PiaBzL$@Ig|4YVE&JT-K^Fdg%IGd&&U#E{L|?YzWAz@qlW9o
zNN?^SMz2>&ME+M|UM~Fm)Sr59&oFqQ113Rwog<IvpPC$k!ZdY&2CSurx;j0BM7yRR
zA}o^Xz*}kFfB9;<l_<mCkI>4#l{kBA_S!zVy>Q+Zh0w?sP45Hs!<zh#uZCIqz26`0
zVtf0(`W(&=yps<ZauD>e6z+Msh$9>7Rk-kOni=lNk>oUaS8HAqlfdPCJpMxdjdl0(
z-irB>-gcQ|Hy^j-HLp4}m-GI{%j>-%3d0w=oQw5fmNmn@G+4Jyr#mNcsk;5w4kzzJ
z_W+a6OB8BHi`I42WV>jsum82`SB8i1FoLtoiM`0VDoY?FJfh|C3U{=oZ&SGTvyqWk
zZ{Ex+dJ?M8vlUPKXo)VI|ML97?kK&EPFFl6+nbjuqRM!8$2wV-H}}s?9N$;2nbryB
z4d;BQZ`lRAV|i?6uY**<7O4BVR60I+pB$^B>k%y6{tSNGz)Ol%LFD$aG0aY2`N#%k
z#eP}4$r1YnQ!S_2XNY=pJ84q)d)w8;GXb_Q--j0jDr=Q37%!yllAwI2Glm7UD#Y4$
zQ${5%X0Tbw$n!vEH*9(}mAn0-484^?#@4y}L(pJyQijLjdx*2?!S}(^r6)8T&jGMM
z{z2Dk;Lw}4KP<5K@4=t%RrMv|kH!f)SuTgTZO{+2eLJ`BIv#0Woxu0rSuVG1{IB_w
z(yx4V=Sq}RL1|c%nBRc?QL138Y#YIYI^M_xpJVhceBZtzuivn)y%!>K@<bAXp|a&W
z!*OSO;RB;IZbujRZGOG>g5K+->>dX<yLRu}LS2Kw-A}+=I-1a)OWT{a!VwtiZhd&}
z2tw8`alHv_-#nJAff9$xIQtyum(qC=M680DpN{8B<dQnxwD9CQ4Cbk~8)(U@`UAtu
zCYcSM8#gKqxE-3F^8~!7OWbMPtAC{+VKqI};hwE<{gQLfbz7gRGAfDI>@?d-h$>@Q
zO-Z@m$}J6h&cf~{79*KT=>@Rzsy(6Pw2jR+1lNgho<~NnPP$`!t$qz-^g1Kcn;t}H
z(ig#}KSNqeopSbm3=&!E)Y-W^5VgBwhKNW$R%?`YoV0@4r{4h*gN-rUO%g!d^KyqH
z_!8;g98`HYqm<R*cDy8^UPOk0Y*_WNuw~Uvwlqyi#^rK4x3;=N#Oy{HyAG%AE|p2k
zslaSYMs5Qi!Tr9dH8fz{x3L7nvLkLMGj^qB_p<6zY$rIbTX@9ywU71{Xj))0IW+x2
zXm48P9#z!Y2neA;+gxV)H7<m@2Mw*|C`4bGK@X<Xg;~0l?h1Iw`Gk3W-DKUqotbup
zEY!P#;B4i4g*oxYj%dlAcv)e5>l2dqO0~&$-VcFi2}`r)GuD-j4=m+qe|uFZ;w!tA
zjx_!3t>%8Mt9Sid?D7Iqx3||*%+DX~a(T3JkUiz-#~IU_+huywZs}&PkfZDXr>`9&
z-9w)zqY&b4Hb2Uu&$*7;{R?S_*YqnURi@u2rz@n5I`P!|GSKk)Ft2I1l74#|l6dMM
zBhs<56}~9rXK?im^Fy6C``MBgc-H%f#Ob7MFrkI)?94*Z6a~s{TcGc+H>)4lIt!yP
zu&~t^5B^~|?SuXS0S{9dZ-4H;b3p&qnfSMLYqBga(l<&oKb!*htzgCH#m2HFCt%PU
z{q2s2Eq#jf(7@@vdY7{C+Ux3RO!gDhv2FCu2?h(#-6>J>KKj$o82aB*ZEfyQa`Z?2
z$=9qweDk|`vYA|9xO_~B9<Rf!Mr~Ppe9<A0u&|5_3}8P5;ZU$+RQIElpa!t!R@QvJ
zVI%^2^pvTUBr>^fDW~5bu81n%GVsfM-^wA*v^JSLK&wa_ybxXHJ}}E-cqbvjdxJ%0
za+w~VyzLsp2s*NRgOdP?*@qydBH4CCUPL9{ful+}uG7cJ%4MI0NK6ShI9qn2>oI12
zyek}fX1j?a=(Y<$5M&7W4}<Lx7B3%TaMP^BkiZ+FK5HFdqhESu@%H3L0-Y4FLu|d9
z1H0*4UN!cv$&jOTSNrTKX1oEk1XrphrvuC^R|hp$XZV9#+3QQD$(Qm;(|~s|5?|3t
z^be-?_e8`KP;Z-J6I16-O&t$zJZ)i9nT$sgDU}MP(<WwahNi7nC!2zQ7KJdc4PMG`
z`YbGI>|dk#S=mUGb=rJ~T%By8QQRF*`-lE-#iMeW+Ah|7u1s^MN#aai;oXpQn5&Ov
zr|3h}&c}1Dnzjk%tRdz(lPoPnio{Ug3m_i^<-q`}hi&bXhzB3C{UG{F-41f1G(Ln^
zAGSj*S->M-_0B-*6Sl~2;XungB*uI6!mkGe6t{ow#Q8tkRtC=F{<%JSH<@J3*8t8N
z6+UzLd}QrWj8cJ{c5<4H{hixeLp8K_tn2hW?SGi8nQJJcz`oQa{t|2{?rgD|Bz<pB
z^(9wcQ8lCF2y`k?uOd{Sk$If^PcT+2WsYLSZ;i5CU}L{)_Tv^dlO*@OJ=fG@Q@k+C
zZ`N)&LWZJF46zW=?}$Gp53qe^=TSmg;Qjs=SD79uhe;NQ$;SG!zPf+tA0+G$vF6I=
zvB%iX3<qN?%K8}3#K#k*)dZ@|^kjVrXhsK8&GS^s9)G$<MpcK)XZ`PXo+4)ope*XK
zrDubUms4jqLky;WsF=KBD@t;(!WNqM;tqOezyWrxWE}EEfBVpzK=uFjhYpU8Gc;tN
z#hGVx)X0YZ*T8v780b0GX?B>pxSDMaj%&gOB;<A{D1tioJ`NY-dPWh<SyL5k2C~cx
z{k3kx5i&zn5}LAETk}Ua+cA$X|CjZp5Sh{Ew=xu^o`s1-;(s|^W<)56S(oo;rjh<5
z;u2x=jnjWtUx{A#U_OgQ`L{u%8S|%LE1zCDRj)^~{~p~jAt@0zgChSC`4=Ypp@A4-
zGBQP>TA_bz>~IpJ9IrxR(uk9j|56BYiKzI-t<2@0rNV^#`zSsskxy_>>LVThUx&mO
z!J$a9?l}@$s|YR<tZTr3)#5|%Q7c>{hl@+XmQd1)2c9vWu>U0&r)GMDFY)4>I5l;m
z`-kzs*f65Z%_%NpX$XBnJ{rIOy7`vvYj!X^QXPk_Y`RR>MS=7$3;AD<yoEBv%&T9e
z$#lpc^}+oL&Fyf>_rAdY`+qI;pnZ<HK6m^TAvn{6;3)nEJU1?urN`AH?ujI;;roAG
z+6-3yjGH`|`)&0Che0*{^D9@4y)+zV3({z*({J|whKixD{uQ%jBkuYkY<03m)V}A|
zS$i<mTu?F#_-|){P2!Y{CV(ktUipp6_n{Z@gNvZ}iFq>sOfSQSa}sdN^j~*$#%@OF
z!TNy{b0pbC5==ZAWuJ~M^LbQ>p(G1*BGcHUXKlO@|LaxN5WqGdjZ>fq^Z==sU9)`J
zj3ZJ^um+`C#(GU-gC~x9b9gz04g~)rr&#q!^OcX^aC;5}b$;1#)?WNFvyO0s+!w#O
z?ufw7c_Pie&9%bK->V|jT~?}=s4(#G{L1<r?VwvyK|lW=7h^p6iJ16U{U1|8>r9AH
z7j7xYuOx07O!hl{fHet3A(@e|S95T(nZ++>h?XMG=)Pk2&&lSwMQVZi=s6fy;Rj0p
z<?k_Jir($2{P`ia$TYIZ-~Ni#7=ZoLlPk`o0V!LM`VCJMJz8nP`rN~(-rKxFvM(+0
zX0jEZMk~7-tbdlV`UsH!J0R`gLi3>m$0=vLU>(2Y(N@9NvB=5chddSqCA5VAV{j$N
zK3f3<GkeW3LFeC^Qvk?&O9>Nk$xS_C>o3R13hQNPET-FOL8HAj9_hFQO8>Z-1j!;E
z@^F}9`x@iot}d|9Q9h?UL<UA4zOlBH@>s|kRq-d8!_!^nwb)|o$DlE>N=yUf!M`vH
zsQ+fQ9}qRxslmb#2D#nN?US)k7F3oOQek4SNdAK^bM{0w9x{E)td8ny=pPjmGWms`
zDT;bwq1{(;yaI*nBKULwkK!&?xZimgAV)3G&NdvKQD&_p@C)MgjN55*a}`V<7J&#J
zB_VX8_3p++*J1u~^*lUTe$=KhACLcEnDcSWc9G|?(u)_Pj%6WC2JV$<O2uW{((FO`
zjNH`;VVD4xhX9grrkEf31MD3p&`E6i)7T_d3J5qDeGSU<@X7NLB4$1nKIt=^;E^r=
zDo&lz^RCMk*X*Y1(vUE)fPwx~@6P0)sF}fv?9apj)qEya%7&cBPt3dwInxp#YcndD
z2b1YzT$%jxJuGaLYMNbUvj5#wUFX?n9RG_uCg=8t3X|@ZX}D<N8gmI#GmU!XquXkc
z@EQPi_JU1a;l>Y?IP-b7<4?OQAttzVlTN{87A`#uRR1nNTry>cQB<E2)^v<Dz8IRU
zJxx2iY6zh%)M8^hCl2V+PtOJJ&zEB^K-?1!q`KYK2wOrtk{F^*=N{(iWS|#>aq`c8
znzkBE{rry_9kI;3Q;lD-^mC@yeSks+j<sBv%sPT<Xn}4bl7?)}8&Q5ZUuY8lo7EaS
z-9x7x4>8VTd?yhF0F~ZT{BIL!RPlN2$Ice{G6>9Q6fsupjv(j0slJBEZw256(oyVh
za3AX7UlWa;z8{p3$Ce8&$|5WsT_X!MVEpge_%p6eMH!Ga^~5&cP?vfr-83PL2`E><
z2Ky4NgzO&JBFVyfDq=}wjT5!P&h}PQw@+3oWNwEF`gcAtiPU*%Xh;!5=Cd=8x8O0e
zRC^o6%z%XzlKCJ2%EW?LrUBxsq5{;00jaf&!~0}v>^u=oT!dUVuZ%3B|LA7E;igP#
zRYPQ_=vps?A^>aCxfTmMN!=XfLs%lHne{i)h&fsyS0W9x;&eS94XpGBpWgu~Z<GaG
zV$ATSW?Z+UQvWbw!iT3}Xu@ll$E8cWTlaH1O<p}RSEBK>e4HZc2encj1HlDOx|#lO
zEOP}pYpY)YQaJVS$z_PLO|r3<X<tPDm4>*+071LNV=UNrJ)=4288K4`;}S%fVw!55
zD|LWJy0FqNj<4UsG(duh4XYtd<g;wCwbuFq1H`||K^A@kno%?SlNQAyO7?rWkj*@p
ztk6a)5#~b7!jCPF70uQv{#hvBMjioCy5u=By2XPia^zoCmrdaG*ky*;o;N>+g@z(Y
zO0yqL8ZeDVjM7C0V1KW#hJUETd}r`6+*P)iHm&8+;4cmm!zUfip*RPFWc^2uM!#d@
zFv6<S7etG-iFYmR6*4Ap0O-iD7s^A_qUqr|_3}7~Uj;)88B1V|UW~`Xv#R6#tgw?z
zw4sjKwK5Vn$^)#iiK9}JW0fUd%}lYxxxiolc}owCnG)S!z!5T4VhLR$>|v#wwC9J)
k9w5q~%Gk#4`@Hd|$%TIV22mpm_VJMrkrl2K)cf`S0PfGvBLDyZ

literal 0
HcmV?d00001

diff --git a/docs/source/using-diffusers/loading.mdx b/docs/source/using-diffusers/loading.mdx
index 35f1e0f928b6..2cb980ea618a 100644
--- a/docs/source/using-diffusers/loading.mdx
+++ b/docs/source/using-diffusers/loading.mdx
@@ -71,7 +71,11 @@ If this is a private repository, make sure to pass a token having permission to
 Therefore, we need to make sure to *click-accept* the license. You can do this by simply visiting 
 the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) and clicking on "Agree and access repository":
 
-![access_request](https://github.com/patrickvonplaten/scientific_images/blob/master/access_request.png)
+<p align="center">
+    <br>
+    <img src="https://raw.githubusercontent.com/huggingface/diffusers/main/docs/source/imgs/access_request.png" width="400"/>
+    <br>
+</p>
 
 Second, you need to login with your access token:
 

From 72eae64d67a4c8430aced38faaf8c09f79012469 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 7 Nov 2022 20:57:45 +0100
Subject: [PATCH 143/269] Fix dtype safety checker inpaint legacy (#1137)

* [Stable Diffusion Inpaint Legacy] Fiix some things

* uP
---
 .../pipeline_stable_diffusion_inpaint_legacy.py               | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 91dcefc91dcd..a92e23bfc18f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -414,7 +414,9 @@ def __call__(
             safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
                 self.device
             )
-            image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
         else:
             has_nsfw_concept = None
 

From bcdb3d594c7414f3465c083e2058148c1715ec2b Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Mon, 7 Nov 2022 20:06:52 +0000
Subject: [PATCH 144/269] Community pipeline img2img inpainting (#1114)

* adds image to image inpainting with `PIL.Image.Image` inputs
the base implementation claims to support `torch.Tensor` but seems it
would also fail in this case.

* `make style` and `make quality`

* updates community examples readme

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 examples/community/README.md             |  36 +-
 examples/community/img2img_inpainting.py | 463 +++++++++++++++++++++++
 2 files changed, 498 insertions(+), 1 deletion(-)
 create mode 100644 examples/community/img2img_inpainting.py

diff --git a/examples/community/README.md b/examples/community/README.md
index bb3964e1a7c4..a7a9ab807a06 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -17,8 +17,8 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values                                                                                                                                                                                                                                                                                                     | [Wildcard Stable Diffusion](#wildcard-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Shyam Sudhakaran](https://github.com/shyamsn97) |
 | Composable Stable Diffusion| Stable Diffusion Pipeline that supports prompts that contain "&#124;" in prompts (as an AND condition) and weights (separated by "&#124;" as well) to positively / negatively weight prompts.                                                                                                                                                                                                                                                                                                     | [Composable Stable Diffusion](#composable-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 | Seed Resizing Stable Diffusion| Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation.                                                                                                                                                                                                                                                                                                     | [Seed Resizing](#seed-resizing)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
-
 | Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image| [Imagic Stable Diffusion](#imagic-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
+| Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting| [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Alex McKinney](https://github.com/vvvm23) |
 
 
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
@@ -501,3 +501,37 @@ res = pipe_compare(
 image = res.images[0]
 image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, h=height))
 ```
+
+### Image to Image Inpainting Stable Diffusion
+
+Similar to the standard stable diffusion inpainting example, except with the addition of an `inner_image` argument.
+
+`image`, `inner_image`, and `mask` should have the same dimensions. `inner_image` should have an alpha (transparency) channel.
+
+The aim is to overlay two images, then mask out the boundary between `image` and `inner_image` to allow stable diffusion to make the connection more seamless.
+For example, this could be used to place a logo on a shirt and make it blend seamlessly.
+
+```python
+import PIL
+import torch
+
+from diffusers import StableDiffusionInpaintPipeline
+
+image_path = "./path-to-image.png"
+inner_image_path = "./path-to-inner-image.png"
+mask_path = "./path-to-mask.png"
+
+init_image = PIL.Image.open(image_path).convert("RGB").resize((512, 512))
+inner_image = PIL.Image.open(inner_image_path).convert("RGBA").resize((512, 512))
+mask_image = PIL.Image.open(mask_path).convert("RGB").resize((512, 512))
+
+pipe = StableDiffusionInpaintPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+pipe = pipe.to("cuda")
+
+prompt = "Your prompt here!"
+image = pipe(prompt=prompt, image=init_image, inner_image=inner_image, mask_image=mask_image).images[0]
+```
\ No newline at end of file
diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py
new file mode 100644
index 000000000000..f7a107136d89
--- /dev/null
+++ b/examples/community/img2img_inpainting.py
@@ -0,0 +1,463 @@
+import inspect
+from typing import Callable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+import PIL
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def prepare_mask_and_masked_image(image, mask):
+    image = np.array(image.convert("RGB"))
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+
+    mask = np.array(mask.convert("L"))
+    mask = mask.astype(np.float32) / 255.0
+    mask = mask[None, None]
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
+    mask = torch.from_numpy(mask)
+
+    masked_image = image * (mask < 0.5)
+
+    return mask, masked_image
+
+
+def check_size(image, height, width):
+    if isinstance(image, PIL.Image.Image):
+        w, h = image.size
+    elif isinstance(image, torch.Tensor):
+        *_, h, w = image.shape
+
+    if h != height or w != width:
+        raise ValueError(f"Image size should be {height}x{width}, but got {h}x{w}")
+
+
+def overlay_inner_image(image, inner_image, paste_offset: Tuple[int] = (0, 0)):
+    inner_image = inner_image.convert("RGBA")
+    image = image.convert("RGB")
+
+    image.paste(inner_image, paste_offset, inner_image)
+    image = image.convert("RGB")
+
+    return image
+
+
+class ImageToImageInpaintingPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image-to-image inpainting using Stable Diffusion. *This is an experimental feature*.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
+        inner_image: Union[torch.FloatTensor, PIL.Image.Image],
+        mask_image: Union[torch.FloatTensor, PIL.Image.Image],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            inner_image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be overlayed onto `image`. Non-transparent
+                regions of `inner_image` must fit inside white pixels in `mask_image`. Expects four channels, with
+                the last channel representing the alpha channel, which will be used to blend `inner_image` with
+                `image`. If not provided, it will be forcibly cast to RGBA.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
+                repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
+                to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
+                instead of 3, so the expected shape would be `(B, H, W, 1)`.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # check if input sizes are correct
+        check_size(image, height, width)
+        check_size(inner_image, height, width)
+        check_size(mask_image, height, width)
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        num_channels_latents = self.vae.config.latent_channels
+        latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # overlay the inner image
+        image = overlay_inner_image(image, inner_image)
+
+        # prepare mask and masked_image
+        mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+        mask = mask.to(device=self.device, dtype=text_embeddings.dtype)
+        masked_image = masked_image.to(device=self.device, dtype=text_embeddings.dtype)
+
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        mask = torch.nn.functional.interpolate(mask, size=(height // 8, width // 8))
+
+        # encode the mask image into latents space so we can concatenate it to the latents
+        masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
+        masked_image_latents = 0.18215 * masked_image_latents
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        mask = mask.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
+        masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        num_channels_mask = mask.shape[1]
+        num_channels_masked_image = masked_image_latents.shape[1]
+
+        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}"
+                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
+                " `pipeline.unet` or your `mask_image` or `image` input."
+            )
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            # concat latents, mask, masked_image_latents in the channel dimension
+            latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

From 01733238a67d6a23a9c11349cf799172df60597d Mon Sep 17 00:00:00 2001
From: JuanCarlosPi <juancopi_81@hotmail.com>
Date: Mon, 7 Nov 2022 15:11:59 -0500
Subject: [PATCH 145/269] [Community Pipeline] Add multilingual stable
 diffusion to community pipelines (#1142)

* Add multilingual_stable_diffusion.py file

* Add multilingual stable diffusion to examples README file

* Update examples/community/README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 examples/community/README.md                  |  72 ++-
 .../multilingual_stable_diffusion.py          | 436 ++++++++++++++++++
 2 files changed, 507 insertions(+), 1 deletion(-)
 create mode 100644 examples/community/multilingual_stable_diffusion.py

diff --git a/examples/community/README.md b/examples/community/README.md
index a7a9ab807a06..b83b2ff4d45e 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -18,9 +18,11 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Composable Stable Diffusion| Stable Diffusion Pipeline that supports prompts that contain "&#124;" in prompts (as an AND condition) and weights (separated by "&#124;" as well) to positively / negatively weight prompts.                                                                                                                                                                                                                                                                                                     | [Composable Stable Diffusion](#composable-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 | Seed Resizing Stable Diffusion| Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation.                                                                                                                                                                                                                                                                                                     | [Seed Resizing](#seed-resizing)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 | Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image| [Imagic Stable Diffusion](#imagic-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
+| Multilingual Stable Diffusion| Stable Diffusion Pipeline that supports prompts in 50 different languages.                                                                                                                                                                                                                                                                                                     | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline)                                                                 | -                                                                                                                                                                                                                  |                        [Juan Carlos Piñeros](https://github.com/juancopi81) |
 | Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting| [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Alex McKinney](https://github.com/vvvm23) |
 
 
+
 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
 ```py
 pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="filename_in_the_community_folder")
@@ -502,6 +504,74 @@ image = res.images[0]
 image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, h=height))
 ```
 
+### Multilingual Stable Diffusion Pipeline
+
+The following code can generate an images from texts in different languages using the pre-trained [mBART-50 many-to-one multilingual machine translation model](https://huggingface.co/facebook/mbart-large-50-many-to-one-mmt) and Stable Diffusion.
+
+```python
+from PIL import Image
+
+import torch
+
+from diffusers import DiffusionPipeline
+from transformers import (
+    pipeline,
+    MBart50TokenizerFast,
+    MBartForConditionalGeneration,
+)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+device_dict = {"cuda": 0, "cpu": -1}
+
+# helper function taken from: https://huggingface.co/blog/stable_diffusion
+def image_grid(imgs, rows, cols):
+    assert len(imgs) == rows*cols
+
+    w, h = imgs[0].size
+    grid = Image.new('RGB', size=(cols*w, rows*h))
+    grid_w, grid_h = grid.size
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i%cols*w, i//cols*h))
+    return grid
+
+# Add language detection pipeline
+language_detection_model_ckpt = "papluca/xlm-roberta-base-language-detection"
+language_detection_pipeline = pipeline("text-classification",
+                                       model=language_detection_model_ckpt,
+                                       device=device_dict[device])
+
+# Add model for language translation
+trans_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-one-mmt")
+trans_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-one-mmt").to(device)
+
+diffuser_pipeline = DiffusionPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    custom_pipeline="multilingual_stable_diffusion",
+    detection_pipeline=language_detection_pipeline,
+    translation_model=trans_model,
+    translation_tokenizer=trans_tokenizer,
+    revision="fp16",
+    torch_dtype=torch.float16,
+)
+
+diffuser_pipeline.enable_attention_slicing()
+diffuser_pipeline = diffuser_pipeline.to(device)
+
+prompt = ["a photograph of an astronaut riding a horse", 
+          "Una casa en la playa",
+          "Ein Hund, der Orange isst",
+          "Un restaurant parisien"]
+
+output = diffuser_pipeline(prompt)
+
+images = output.images
+
+grid = image_grid(images, rows=2, cols=2)
+```
+
+This example produces the following images:
+![image](https://user-images.githubusercontent.com/4313860/198328706-295824a4-9856-4ce5-8e66-278ceb42fd29.png)
+
 ### Image to Image Inpainting Stable Diffusion
 
 Similar to the standard stable diffusion inpainting example, except with the addition of an `inner_image` argument.
@@ -534,4 +604,4 @@ pipe = pipe.to("cuda")
 
 prompt = "Your prompt here!"
 image = pipe(prompt=prompt, image=init_image, inner_image=inner_image, mask_image=mask_image).images[0]
-```
\ No newline at end of file
+```
diff --git a/examples/community/multilingual_stable_diffusion.py b/examples/community/multilingual_stable_diffusion.py
new file mode 100644
index 000000000000..c71c1f10c588
--- /dev/null
+++ b/examples/community/multilingual_stable_diffusion.py
@@ -0,0 +1,436 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+from transformers import (
+    CLIPFeatureExtractor,
+    CLIPTextModel,
+    CLIPTokenizer,
+    MBart50TokenizerFast,
+    MBartForConditionalGeneration,
+    pipeline,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def detect_language(pipe, prompt, batch_size):
+    """helper function to detect language(s) of prompt"""
+
+    if batch_size == 1:
+        preds = pipe(prompt, top_k=1, truncation=True, max_length=128)
+        return preds[0]["label"]
+    else:
+        detected_languages = []
+        for p in prompt:
+            preds = pipe(p, top_k=1, truncation=True, max_length=128)
+            detected_languages.append(preds[0]["label"])
+
+        return detected_languages
+
+
+def translate_prompt(prompt, translation_tokenizer, translation_model, device):
+    """helper function to translate prompt to English"""
+
+    encoded_prompt = translation_tokenizer(prompt, return_tensors="pt").to(device)
+    generated_tokens = translation_model.generate(**encoded_prompt, max_new_tokens=1000)
+    en_trans = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+
+    return en_trans[0]
+
+
+class MultilingualStableDiffusion(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion in different languages.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        detection_pipeline ([`pipeline`]):
+            Transformers pipeline to detect prompt's language.
+        translation_model ([`MBartForConditionalGeneration`]):
+            Model to translate prompt to English, if necessary. Please refer to the
+            [model card](https://huggingface.co/docs/transformers/model_doc/mbart) for details.
+        translation_tokenizer ([`MBart50TokenizerFast`]):
+            Tokenizer of the translation model.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        detection_pipeline: pipeline,
+        translation_model: MBartForConditionalGeneration,
+        translation_tokenizer: MBart50TokenizerFast,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            detection_pipeline=detection_pipeline,
+            translation_model=translation_model,
+            translation_tokenizer=translation_tokenizer,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation. Can be in different languages.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        # detect language and translate if necessary
+        prompt_language = detect_language(self.detection_pipeline, prompt, batch_size)
+        if batch_size == 1 and prompt_language != "en":
+            prompt = translate_prompt(prompt, self.translation_tokenizer, self.translation_model, self.device)
+
+        if isinstance(prompt, list):
+            for index in range(batch_size):
+                if prompt_language[index] != "en":
+                    p = translate_prompt(
+                        prompt[index], self.translation_tokenizer, self.translation_model, self.device
+                    )
+                    prompt[index] = p
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                # detect language and translate it if necessary
+                negative_prompt_language = detect_language(self.detection_pipeline, negative_prompt, batch_size)
+                if negative_prompt_language != "en":
+                    negative_prompt = translate_prompt(
+                        negative_prompt, self.translation_tokenizer, self.translation_model, self.device
+                    )
+                if isinstance(negative_prompt, str):
+                    uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                # detect language and translate it if necessary
+                if isinstance(negative_prompt, list):
+                    negative_prompt_languages = detect_language(self.detection_pipeline, negative_prompt, batch_size)
+                    for index in range(batch_size):
+                        if negative_prompt_languages[index] != "en":
+                            p = translate_prompt(
+                                negative_prompt[index], self.translation_tokenizer, self.translation_model, self.device
+                            )
+                            negative_prompt[index] = p
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        # get the initial random noise unless the user supplied it
+
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
+                    self.device
+                )
+            else:
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
+                self.device
+            )
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
+            )
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

From ac4c695d97779cc4d0c92585f8e87ad39aa4aeb2 Mon Sep 17 00:00:00 2001
From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com>
Date: Tue, 8 Nov 2022 03:26:59 +0700
Subject: [PATCH 146/269] [Flax examples] Load text encoder from subfolder
 (#1147)

load text encoder from subfolder
---
 examples/dreambooth/train_dreambooth_flax.py         | 4 +++-
 examples/text_to_image/train_text_to_image_flax.py   | 4 +++-
 examples/textual_inversion/textual_inversion_flax.py | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py
index 84493b1d9484..078a66e4acee 100644
--- a/examples/dreambooth/train_dreambooth_flax.py
+++ b/examples/dreambooth/train_dreambooth_flax.py
@@ -452,7 +452,9 @@ def collate_fn(examples):
         weight_dtype = jnp.bfloat16
 
     # Load models and create wrapper for stable diffusion
-    text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", dtype=weight_dtype)
+    text_encoder = FlaxCLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", dtype=weight_dtype
+    )
     vae, vae_params = FlaxAutoencoderKL.from_pretrained(
         args.pretrained_model_name_or_path, subfolder="vae", dtype=weight_dtype
     )
diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py
index cacfacef498b..89a8dec7289e 100644
--- a/examples/text_to_image/train_text_to_image_flax.py
+++ b/examples/text_to_image/train_text_to_image_flax.py
@@ -379,7 +379,9 @@ def collate_fn(examples):
 
     # Load models and create wrapper for stable diffusion
     tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_name_or_path, subfolder="tokenizer")
-    text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", dtype=weight_dtype)
+    text_encoder = FlaxCLIPTextModel.from_pretrained(
+        args.pretrained_model_name_or_path, subfolder="text_encoder", dtype=weight_dtype
+    )
     vae, vae_params = FlaxAutoencoderKL.from_pretrained(
         args.pretrained_model_name_or_path, subfolder="vae", dtype=weight_dtype
     )
diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py
index 84ff97c39a96..be2b7ffb5490 100644
--- a/examples/textual_inversion/textual_inversion_flax.py
+++ b/examples/textual_inversion/textual_inversion_flax.py
@@ -391,7 +391,7 @@ def main():
     placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token)
 
     # Load models and create wrapper for stable diffusion
-    text_encoder = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+    text_encoder = FlaxCLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
     vae, vae_params = FlaxAutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
     unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
 

From fa6e5209a8d500893417d81e32c53aad1f8e9ecf Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Mon, 7 Nov 2022 21:59:36 +0100
Subject: [PATCH 147/269] Link to Dreambooth blog post instead of W&B report
 (#1180)

Link to Dreambooth blog post instead of W&B report.
---
 README.md                           | 2 +-
 docs/source/training/dreambooth.mdx | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index d3ac5702cd5f..5c7b911c2ede 100644
--- a/README.md
+++ b/README.md
@@ -346,7 +346,7 @@ Textual Inversion is a technique for capturing novel concepts from a small numbe
 
 - Textual Inversion. Capture novel concepts from a small set of sample images, and associate them with new "words" in the embedding space of the text encoder. Please, refer to [our training examples](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) or [documentation](https://huggingface.co/docs/diffusers/training/text_inversion) to try for yourself.
 
-- Dreambooth. Another technique to capture new concepts in Stable Diffusion. This method fine-tunes the UNet (and, optionally, also the text encoder) of the pipeline to achieve impressive results. Please, refer to [our training examples](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) and [training report](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) for additional details and training recommendations.
+- Dreambooth. Another technique to capture new concepts in Stable Diffusion. This method fine-tunes the UNet (and, optionally, also the text encoder) of the pipeline to achieve impressive results. Please, refer to [our training example](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) and [training report](https://huggingface.co/blog/dreambooth) for additional details and training recommendations.
 
 - Full Stable Diffusion fine-tuning. If you have a more sizable dataset with a specific look or style, you can fine-tune Stable Diffusion so that it outputs images following those examples. This was the approach taken to create [a Pokémon Stable Diffusion model](https://huggingface.co/justinpinkney/pokemon-stable-diffusion) (by Justing Pinkney / Lambda Labs), [a Japanese specific version of Stable Diffusion](https://huggingface.co/spaces/rinna/japanese-stable-diffusion) (by [Rinna Co.](https://github.com/rinnakk/japanese-stable-diffusion/) and others. You can start at [our text-to-image fine-tuning example](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) and go from there.
 
diff --git a/docs/source/training/dreambooth.mdx b/docs/source/training/dreambooth.mdx
index cf7e5dbcec41..238dcb24cf8f 100644
--- a/docs/source/training/dreambooth.mdx
+++ b/docs/source/training/dreambooth.mdx
@@ -23,7 +23,7 @@ The [Dreambooth training script](https://github.com/huggingface/diffusers/tree/m
 
 <!-- TODO: replace with our blog when it's done -->
 
-Dreambooth fine-tuning is very sensitive to hyperparameters and easy to overfit. We recommend you take a look at our [in-depth analysis](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) with recommended settings for different subjects, and go from there.
+Dreambooth fine-tuning is very sensitive to hyperparameters and easy to overfit. We recommend you take a look at our [in-depth analysis](https://huggingface.co/blog/dreambooth) with recommended settings for different subjects, and go from there.
 
 </Tip>
 
@@ -148,7 +148,7 @@ accelerate launch train_dreambooth.py \
 
 ### Fine-tune the text encoder in addition to the UNet
 
-The script also allows to fine-tune the `text_encoder` along with the `unet`. It has been observed experimentally that this gives much better results, especially on faces. Please, refer to [our report](https://wandb.ai/psuraj/dreambooth/reports/Dreambooth-Training-Analysis--VmlldzoyNzk0NDc3) for more details.
+The script also allows to fine-tune the `text_encoder` along with the `unet`. It has been observed experimentally that this gives much better results, especially on faces. Please, refer to [our blog](https://huggingface.co/blog/dreambooth) for more details.
 
 To enable this option, pass the `--train_text_encoder` argument to the training script.
 

From c3dcb6749b744bbf3ebfd1329de6a50009b849ee Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 8 Nov 2022 11:31:15 +0100
Subject: [PATCH 148/269] Update config.yml

---
 .github/ISSUE_TEMPLATE/config.yml | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index fd8eaa8d13fc..d8cf414a8cb7 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,7 +1,4 @@
 contact_links:
-  - name: Forum
-    url: https://discuss.huggingface.co/c/discussion-related-to-httpsgithubcomhuggingfacediffusers/63
-    about: General usage questions and community discussions
   - name: Blank issue
     url: https://github.com/huggingface/diffusers/issues/new
-    about: Please note that the Forum is in most places the right place for discussions
+    about: General usage questions and community discussions

From 20a05d6a506e10fe82e998c5b2f22fce6a298229 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Tue, 8 Nov 2022 12:30:51 +0100
Subject: [PATCH 149/269] Fix small typo (#1178)

Unless it's intentional, lol
---
 src/diffusers/configuration_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index d830857a302c..39762e41d344 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -101,7 +101,7 @@ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool
         output_config_file = os.path.join(save_directory, self.config_name)
 
         self.to_json_file(output_config_file)
-        logger.info(f"ConfigMixinuration saved in {output_config_file}")
+        logger.info(f"Configuration saved in {output_config_file}")
 
     @classmethod
     def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs):

From 5a8b3569226c47667113d8a55cfc10f1de0ae67a Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Tue, 8 Nov 2022 13:11:12 +0100
Subject: [PATCH 150/269] [DDIMScheduler] fix noise device in ddim step (#1189)

* fix noise device in ddim sched

* fix typo

* self.device -> device

* remove duplicated if

* use str device

* don't use str for device
---
 src/diffusers/schedulers/scheduling_ddim.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 8d4407c16c30..1acb81764d32 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -288,7 +288,7 @@ def step(
 
         if eta > 0:
             # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072
-            device = model_output.device if torch.is_tensor(model_output) else "cpu"
+            device = model_output.device if torch.is_tensor(model_output) else torch.device("cpu")
             if variance_noise is not None and generator is not None:
                 raise ValueError(
                     "Cannot pass both generator and variance_noise. Please make sure that either `generator` or"
@@ -296,9 +296,14 @@ def step(
                 )
 
             if variance_noise is None:
-                variance_noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(
-                    device
-                )
+                if device.type == "mps":
+                    # randn does not work reproducibly on mps
+                    variance_noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator)
+                    variance_noise = variance_noise.to(device)
+                else:
+                    variance_noise = torch.randn(
+                        model_output.shape, generator=generator, device=device, dtype=model_output.dtype
+                    )
             variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * variance_noise
 
             prev_sample = prev_sample + variance

From 813744e5f3af32a81cf31427940d1a2d3abdf578 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Tue, 8 Nov 2022 13:11:33 +0100
Subject: [PATCH 151/269] MPS schedulers: don't use float64 (#1169)

* Schedulers: don't use float64 on mps

* Test set_timesteps() on device (float schedulers).

* SD pipeline: use device in set_timesteps.

* SD in-painting pipeline: use device in set_timesteps.

* Tests: fix mps crashes.

* Skip test_load_pipeline_from_git on mps.

Not compatible with float16.

* Use device.type instead of str in Euler schedulers.
---
 .../pipeline_stable_diffusion.py              |  9 +-
 .../pipeline_stable_diffusion_inpaint.py      |  9 +-
 .../scheduling_euler_ancestral_discrete.py    | 10 ++-
 .../schedulers/scheduling_euler_discrete.py   | 10 ++-
 .../schedulers/scheduling_lms_discrete.py     |  7 +-
 tests/models/test_models_unet_2d.py           |  3 +
 tests/test_pipelines.py                       |  4 +-
 tests/test_scheduler.py                       | 87 ++++++++++++++++++-
 8 files changed, 117 insertions(+), 22 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 9c7edabf69f1..30be4156f92d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -360,12 +360,9 @@ def __call__(
                 raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
             latents = latents.to(self.device)
 
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+        # set timesteps and move to the correct device
+        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        timesteps_tensor = self.scheduler.timesteps
 
         # scale the initial noise by the standard deviation required by the scheduler
         latents = latents * self.scheduler.init_noise_sigma
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index a7af1c9d3351..83d27926dc5b 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -416,12 +416,9 @@ def __call__(
                 " `pipeline.unet` or your `mask_image` or `image` input."
             )
 
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+        # set timesteps and move to the correct device
+        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        timesteps_tensor = self.scheduler.timesteps
 
         # scale the initial noise by the standard deviation required by the scheduler
         latents = latents * self.scheduler.init_noise_sigma
diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
index 7f44067325cf..33505c81c004 100644
--- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -151,7 +151,11 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
         sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
         self.sigmas = torch.from_numpy(sigmas).to(device=device)
-        self.timesteps = torch.from_numpy(timesteps).to(device=device)
+        if str(device).startswith("mps"):
+            # mps does not support float64
+            self.timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32)
+        else:
+            self.timesteps = torch.from_numpy(timesteps).to(device=device)
 
     def step(
         self,
@@ -217,8 +221,8 @@ def step(
 
         prev_sample = sample + derivative * dt
 
-        device = model_output.device if torch.is_tensor(model_output) else "cpu"
-        if str(device) == "mps":
+        device = model_output.device if torch.is_tensor(model_output) else torch.device("cpu")
+        if device.type == "mps":
             # randn does not work reproducibly on mps
             noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator).to(
                 device
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index 50a1bd89f839..9f707c27a15b 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -152,7 +152,11 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
         sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
         self.sigmas = torch.from_numpy(sigmas).to(device=device)
-        self.timesteps = torch.from_numpy(timesteps).to(device=device)
+        if str(device).startswith("mps"):
+            # mps does not support float64
+            self.timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32)
+        else:
+            self.timesteps = torch.from_numpy(timesteps).to(device=device)
 
     def step(
         self,
@@ -214,8 +218,8 @@ def step(
 
         gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
 
-        device = model_output.device if torch.is_tensor(model_output) else "cpu"
-        if str(device) == "mps":
+        device = model_output.device if torch.is_tensor(model_output) else torch.device("cpu")
+        if device.type == "mps":
             # randn does not work reproducibly on mps
             noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator).to(
                 device
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index d636fe6fe87f..373c373ee0fe 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -173,8 +173,13 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
         sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
         sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+
         self.sigmas = torch.from_numpy(sigmas).to(device=device)
-        self.timesteps = torch.from_numpy(timesteps).to(device=device)
+        if str(device).startswith("mps"):
+            # mps does not support float64
+            self.timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32)
+        else:
+            self.timesteps = torch.from_numpy(timesteps).to(device=device)
 
         self.derivatives = []
 
diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
index 71ddf1a13414..81437311c658 100644
--- a/tests/models/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -456,6 +456,7 @@ def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False):
             # fmt: on
         ]
     )
+    @require_torch_gpu
     def test_compvis_sd_v1_4(self, seed, timestep, expected_slice):
         model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4")
         latents = self.get_latents(seed)
@@ -507,6 +508,7 @@ def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice):
             # fmt: on
         ]
     )
+    @require_torch_gpu
     def test_compvis_sd_v1_5(self, seed, timestep, expected_slice):
         model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5")
         latents = self.get_latents(seed)
@@ -558,6 +560,7 @@ def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice):
             # fmt: on
         ]
     )
+    @require_torch_gpu
     def test_compvis_sd_inpaint(self, seed, timestep, expected_slice):
         model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting")
         latents = self.get_latents(seed, shape=(4, 9, 64, 64))
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index b8316075fa93..2b19b08b3742 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -41,7 +41,7 @@
 from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, slow, torch_device
-from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir
+from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir, require_torch_gpu
 from parameterized import parameterized
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
@@ -124,7 +124,7 @@ def test_local_custom_pipeline(self):
         assert output_str == "This is a local test"
 
     @slow
-    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
+    @require_torch_gpu
     def test_load_pipeline_from_git(self):
         clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
 
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 056f723835ba..70201d1e6774 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -83,8 +83,8 @@ def check_over_configs(self, time_step=0, **config):
 
         num_inference_steps = kwargs.pop("num_inference_steps", None)
 
-        # TODO(Suraj) - delete the following two lines once DDPM, DDIM, and PNDM have timesteps casted to float by default
         for scheduler_class in self.scheduler_classes:
+            # TODO(Suraj) - delete the following two lines once DDPM, DDIM, and PNDM have timesteps casted to float by default
             if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler):
                 time_step = float(time_step)
 
@@ -1192,6 +1192,31 @@ def test_full_loop_no_noise(self):
         assert abs(result_sum.item() - 1006.388) < 1e-2
         assert abs(result_mean.item() - 1.31) < 1e-3
 
+    def test_full_loop_device(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for i, t in enumerate(scheduler.timesteps):
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_sum.item() - 1006.388) < 1e-2
+        assert abs(result_mean.item() - 1.31) < 1e-3
+
 
 class EulerDiscreteSchedulerTest(SchedulerCommonTest):
     scheduler_classes = (EulerDiscreteScheduler,)
@@ -1248,6 +1273,34 @@ def test_full_loop_no_noise(self):
         assert abs(result_sum.item() - 10.0807) < 1e-2
         assert abs(result_mean.item() - 0.0131) < 1e-3
 
+    def test_full_loop_device(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        generator = torch.Generator().manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+        print(result_sum, result_mean)
+
+        assert abs(result_sum.item() - 10.0807) < 1e-2
+        assert abs(result_mean.item() - 0.0131) < 1e-3
+
 
 class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest):
     scheduler_classes = (EulerAncestralDiscreteScheduler,)
@@ -1303,6 +1356,38 @@ def test_full_loop_no_noise(self):
         assert abs(result_sum.item() - 152.3192) < 1e-2
         assert abs(result_mean.item() - 0.1983) < 1e-3
 
+    def test_full_loop_device(self):
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+        scheduler = scheduler_class(**scheduler_config)
+
+        scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
+
+        generator = torch.Generator().manual_seed(0)
+
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
+
+        for t in scheduler.timesteps:
+            sample = scheduler.scale_model_input(sample, t)
+
+            model_output = model(sample, t)
+
+            output = scheduler.step(model_output, t, sample, generator=generator)
+            sample = output.prev_sample
+
+        result_sum = torch.sum(torch.abs(sample))
+        result_mean = torch.mean(torch.abs(sample))
+        print(result_sum, result_mean)
+        if not str(torch_device).startswith("mps"):
+            # The following sum varies between 148 and 156 on mps. Why?
+            assert abs(result_sum.item() - 152.3192) < 1e-2
+            assert abs(result_mean.item() - 0.1983) < 1e-3
+        else:
+            # Larger tolerance on mps
+            assert abs(result_mean.item() - 0.1983) < 1e-2
+
 
 class IPNDMSchedulerTest(SchedulerCommonTest):
     scheduler_classes = (IPNDMScheduler,)

From 555203e1faa32cfa07c6128c09a8352031d7a969 Mon Sep 17 00:00:00 2001
From: Yuta Hayashibe <shirayu@users.noreply.github.com>
Date: Tue, 8 Nov 2022 22:31:13 +0900
Subject: [PATCH 152/269] Warning for invalid options without
 "--with_prior_preservation" (#1065)

* Make errors for invalid options without "--with_prior_preservation"

* Make --instance_prompt required

* Removed needless check because --instance_data_dir is marked with required

* Updated messages

* Use logger.warning instead of raise errors

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 examples/dreambooth/train_dreambooth.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 9c512ef571d8..610c18533b0b 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -66,6 +66,7 @@ def parse_args(input_args=None):
         "--instance_prompt",
         type=str,
         default=None,
+        required=True,
         help="The prompt with identifier specifying the instance",
     )
     parser.add_argument(
@@ -205,14 +206,16 @@ def parse_args(input_args=None):
     if env_local_rank != -1 and env_local_rank != args.local_rank:
         args.local_rank = env_local_rank
 
-    if args.instance_data_dir is None:
-        raise ValueError("You must specify a train data directory.")
-
     if args.with_prior_preservation:
         if args.class_data_dir is None:
             raise ValueError("You must specify a data directory for class images.")
         if args.class_prompt is None:
             raise ValueError("You must specify prompt for class images.")
+    else:
+        if args.class_data_dir is not None:
+            logger.warning("You need not use --class_data_dir without --with_prior_preservation.")
+        if args.class_prompt is not None:
+            logger.warning("You need not use --class_prompt without --with_prior_preservation.")
 
     return args
 

From 11f7d6f3cc07ed305c162d96bcdddb2ee6802832 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Tue, 8 Nov 2022 14:39:11 +0100
Subject: [PATCH 153/269] [ONNX] Improve ONNXPipeline scheduler compatibility,
 fix safety_checker (#1173)

* [ONNX] Improve ONNX scheduler compatibility, fix safety_checker

* typo
---
 ...ert_stable_diffusion_checkpoint_to_onnx.py |  73 +++++++----
 src/diffusers/onnx_utils.py                   |  28 ++++-
 src/diffusers/pipeline_utils.py               |   2 +-
 .../pipeline_onnx_stable_diffusion.py         |  23 ++--
 .../pipeline_onnx_stable_diffusion_img2img.py |  17 ++-
 .../pipeline_onnx_stable_diffusion_inpaint.py |  18 ++-
 src/diffusers/utils/__init__.py               |   1 +
 .../test_onnx_stable_diffusion.py             | 116 ++++++++++++++++--
 .../test_onnx_stable_diffusion_img2img.py     |  77 +++++++++---
 .../test_onnx_stable_diffusion_inpaint.py     |  80 +++++++++---
 10 files changed, 346 insertions(+), 89 deletions(-)

diff --git a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
index 8e0b58c56df3..f0e0b178af20 100644
--- a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
+++ b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
@@ -81,6 +81,8 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
     output_path = Path(output_path)
 
     # TEXT ENCODER
+    num_tokens = pipeline.text_encoder.config.max_position_embeddings
+    text_hidden_size = pipeline.text_encoder.config.hidden_size
     text_input = pipeline.tokenizer(
         "A sample prompt",
         padding="max_length",
@@ -103,13 +105,15 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
     del pipeline.text_encoder
 
     # UNET
+    unet_in_channels = pipeline.unet.config.in_channels
+    unet_sample_size = pipeline.unet.config.sample_size
     unet_path = output_path / "unet" / "model.onnx"
     onnx_export(
         pipeline.unet,
         model_args=(
-            torch.randn(2, pipeline.unet.in_channels, 64, 64).to(device=device, dtype=dtype),
-            torch.LongTensor([0, 1]).to(device=device),
-            torch.randn(2, 77, 768).to(device=device, dtype=dtype),
+            torch.randn(2, unet_in_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
+            torch.randn(2).to(device=device, dtype=dtype),
+            torch.randn(2, num_tokens, text_hidden_size).to(device=device, dtype=dtype),
             False,
         ),
         output_path=unet_path,
@@ -142,11 +146,16 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
 
     # VAE ENCODER
     vae_encoder = pipeline.vae
+    vae_in_channels = vae_encoder.config.in_channels
+    vae_sample_size = vae_encoder.config.sample_size
     # need to get the raw tensor output (sample) from the encoder
     vae_encoder.forward = lambda sample, return_dict: vae_encoder.encode(sample, return_dict)[0].sample()
     onnx_export(
         vae_encoder,
-        model_args=(torch.randn(1, 3, 512, 512).to(device=device, dtype=dtype), False),
+        model_args=(
+            torch.randn(1, vae_in_channels, vae_sample_size, vae_sample_size).to(device=device, dtype=dtype),
+            False,
+        ),
         output_path=output_path / "vae_encoder" / "model.onnx",
         ordered_input_names=["sample", "return_dict"],
         output_names=["latent_sample"],
@@ -158,11 +167,16 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
 
     # VAE DECODER
     vae_decoder = pipeline.vae
+    vae_latent_channels = vae_decoder.config.latent_channels
+    vae_out_channels = vae_decoder.config.out_channels
     # forward only through the decoder part
     vae_decoder.forward = vae_encoder.decode
     onnx_export(
         vae_decoder,
-        model_args=(torch.randn(1, 4, 64, 64).to(device=device, dtype=dtype), False),
+        model_args=(
+            torch.randn(1, vae_latent_channels, unet_sample_size, unet_sample_size).to(device=device, dtype=dtype),
+            False,
+        ),
         output_path=output_path / "vae_decoder" / "model.onnx",
         ordered_input_names=["latent_sample", "return_dict"],
         output_names=["sample"],
@@ -174,24 +188,35 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
     del pipeline.vae
 
     # SAFETY CHECKER
-    safety_checker = pipeline.safety_checker
-    safety_checker.forward = safety_checker.forward_onnx
-    onnx_export(
-        pipeline.safety_checker,
-        model_args=(
-            torch.randn(1, 3, 224, 224).to(device=device, dtype=dtype),
-            torch.randn(1, 512, 512, 3).to(device=device, dtype=dtype),
-        ),
-        output_path=output_path / "safety_checker" / "model.onnx",
-        ordered_input_names=["clip_input", "images"],
-        output_names=["out_images", "has_nsfw_concepts"],
-        dynamic_axes={
-            "clip_input": {0: "batch", 1: "channels", 2: "height", 3: "width"},
-            "images": {0: "batch", 1: "height", 2: "width", 3: "channels"},
-        },
-        opset=opset,
-    )
-    del pipeline.safety_checker
+    if pipeline.safety_checker is not None:
+        safety_checker = pipeline.safety_checker
+        clip_num_channels = safety_checker.config.vision_config.num_channels
+        clip_image_size = safety_checker.config.vision_config.image_size
+        safety_checker.forward = safety_checker.forward_onnx
+        onnx_export(
+            pipeline.safety_checker,
+            model_args=(
+                torch.randn(
+                    1,
+                    clip_num_channels,
+                    clip_image_size,
+                    clip_image_size,
+                ).to(device=device, dtype=dtype),
+                torch.randn(1, vae_sample_size, vae_sample_size, vae_out_channels).to(device=device, dtype=dtype),
+            ),
+            output_path=output_path / "safety_checker" / "model.onnx",
+            ordered_input_names=["clip_input", "images"],
+            output_names=["out_images", "has_nsfw_concepts"],
+            dynamic_axes={
+                "clip_input": {0: "batch", 1: "channels", 2: "height", 3: "width"},
+                "images": {0: "batch", 1: "height", 2: "width", 3: "channels"},
+            },
+            opset=opset,
+        )
+        del pipeline.safety_checker
+        safety_checker = OnnxRuntimeModel.from_pretrained(output_path / "safety_checker")
+    else:
+        safety_checker = None
 
     onnx_pipeline = OnnxStableDiffusionPipeline(
         vae_encoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_encoder"),
@@ -200,7 +225,7 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
         tokenizer=pipeline.tokenizer,
         unet=OnnxRuntimeModel.from_pretrained(output_path / "unet"),
         scheduler=pipeline.scheduler,
-        safety_checker=OnnxRuntimeModel.from_pretrained(output_path / "safety_checker"),
+        safety_checker=safety_checker,
         feature_extractor=pipeline.feature_extractor,
     )
 
diff --git a/src/diffusers/onnx_utils.py b/src/diffusers/onnx_utils.py
index 142174f6e101..b2c533ed741f 100644
--- a/src/diffusers/onnx_utils.py
+++ b/src/diffusers/onnx_utils.py
@@ -24,7 +24,7 @@
 
 from huggingface_hub import hf_hub_download
 
-from .utils import ONNX_WEIGHTS_NAME, is_onnx_available, logging
+from .utils import ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, is_onnx_available, logging
 
 
 if is_onnx_available():
@@ -33,13 +33,28 @@
 
 logger = logging.get_logger(__name__)
 
+ORT_TO_NP_TYPE = {
+    "tensor(bool)": np.bool_,
+    "tensor(int8)": np.int8,
+    "tensor(uint8)": np.uint8,
+    "tensor(int16)": np.int16,
+    "tensor(uint16)": np.uint16,
+    "tensor(int32)": np.int32,
+    "tensor(uint32)": np.uint32,
+    "tensor(int64)": np.int64,
+    "tensor(uint64)": np.uint64,
+    "tensor(float16)": np.float16,
+    "tensor(float)": np.float32,
+    "tensor(double)": np.float64,
+}
+
 
 class OnnxRuntimeModel:
     def __init__(self, model=None, **kwargs):
         logger.info("`diffusers.OnnxRuntimeModel` is experimental and might change in the future.")
         self.model = model
         self.model_save_dir = kwargs.get("model_save_dir", None)
-        self.latest_model_name = kwargs.get("latest_model_name", "model.onnx")
+        self.latest_model_name = kwargs.get("latest_model_name", ONNX_WEIGHTS_NAME)
 
     def __call__(self, **kwargs):
         inputs = {k: np.array(v) for k, v in kwargs.items()}
@@ -84,6 +99,15 @@ def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional
         except shutil.SameFileError:
             pass
 
+        # copy external weights (for models >2GB)
+        src_path = self.model_save_dir.joinpath(ONNX_EXTERNAL_WEIGHTS_NAME)
+        if src_path.exists():
+            dst_path = Path(save_directory).joinpath(ONNX_EXTERNAL_WEIGHTS_NAME)
+            try:
+                shutil.copyfile(src_path, dst_path)
+            except shutil.SameFileError:
+                pass
+
     def save_pretrained(
         self,
         save_directory: Union[str, os.PathLike],
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 628e6320129b..4e7aeb8c9157 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -541,7 +541,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             # if the model is in a pipeline module, then we load it from the pipeline
             if name in passed_class_obj:
                 # 1. check that passed_class_obj has correct parent class
-                if not is_pipeline_module:
+                if not is_pipeline_module and passed_class_obj[name] is not None:
                     library = importlib.import_module(library_name)
                     class_obj = getattr(library, class_name)
                     importable_classes = LOADABLE_CLASSES[library_name]
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index 22f5bf6c432d..0c50e424e249 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -2,11 +2,12 @@
 from typing import Callable, List, Optional, Union
 
 import numpy as np
+import torch
 
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
-from ...onnx_utils import OnnxRuntimeModel
+from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import deprecate, logging
@@ -186,7 +187,7 @@ def __call__(
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
 
-        latents = latents * self.scheduler.init_noise_sigma
+        latents = latents * np.float(self.scheduler.init_noise_sigma)
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -197,15 +198,20 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
+        timestep_dtype = next(
+            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
         for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+            latent_model_input = latent_model_input.cpu().numpy()
 
             # predict the noise residual
-            noise_pred = self.unet(
-                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
-            )
+            timestep = np.array([t], dtype=timestep_dtype)
+            noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=text_embeddings)
             noise_pred = noise_pred[0]
 
             # perform guidance
@@ -214,7 +220,7 @@ def __call__(
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
             # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            latents = self.scheduler.step(noise_pred, t, torch.from_numpy(latents), **extra_step_kwargs).prev_sample
             latents = np.array(latents)
 
             # call the callback, if provided
@@ -235,6 +241,9 @@ def __call__(
             safety_checker_input = self.feature_extractor(
                 self.numpy_to_pil(image), return_tensors="np"
             ).pixel_values.astype(image.dtype)
+
+            image, has_nsfw_concepts = self.safety_checker(clip_input=safety_checker_input, images=image)
+
             # There will throw an error if use safety_checker batchsize>1
             images, has_nsfw_concept = [], []
             for i in range(image.shape[0]):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index 04ecdbecc6d5..f85069b969b7 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -8,7 +8,7 @@
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
-from ...onnx_utils import OnnxRuntimeModel
+from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import deprecate, logging
@@ -338,14 +338,21 @@ def __call__(
         t_start = max(num_inference_steps - init_timestep + offset, 0)
         timesteps = self.scheduler.timesteps[t_start:].numpy()
 
+        timestep_dtype = next(
+            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
         for i, t in enumerate(self.progress_bar(timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
-            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+            latent_model_input = latent_model_input.cpu().numpy()
 
             # predict the noise residual
+            timestep = np.array([t], dtype=timestep_dtype)
             noise_pred = self.unet(
-                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
+                sample=latent_model_input, timestep=timestep, encoder_hidden_states=text_embeddings
             )[0]
 
             # perform guidance
@@ -354,7 +361,7 @@ def __call__(
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
             # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            latents = self.scheduler.step(noise_pred, t, torch.from_numpy(latents), **extra_step_kwargs).prev_sample
             latents = latents.numpy()
 
             # call the callback, if provided
@@ -375,7 +382,7 @@ def __call__(
             safety_checker_input = self.feature_extractor(
                 self.numpy_to_pil(image), return_tensors="np"
             ).pixel_values.astype(image.dtype)
-            # There will throw an error if use safety_checker batchsize>1
+            # safety_checker does not support batched inputs yet
             images, has_nsfw_concept = [], []
             for i in range(image.shape[0]):
                 image_i, has_nsfw_concept_i = self.safety_checker(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 517242921d92..f763291c77f0 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -8,7 +8,7 @@
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
-from ...onnx_utils import OnnxRuntimeModel
+from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import deprecate, logging
@@ -352,7 +352,7 @@ def __call__(
         self.scheduler.set_timesteps(num_inference_steps)
 
         # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
+        latents = latents * np.float(self.scheduler.init_noise_sigma)
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -363,17 +363,23 @@ def __call__(
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
 
+        timestep_dtype = next(
+            (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)"
+        )
+        timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype]
+
         for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
             # concat latents, mask, masked_image_latnets in the channel dimension
             latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1)
             latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
-            latent_model_input = latent_model_input.numpy()
+            latent_model_input = latent_model_input.cpu().numpy()
 
             # predict the noise residual
+            timestep = np.array([t], dtype=timestep_dtype)
             noise_pred = self.unet(
-                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
+                sample=latent_model_input, timestep=timestep, encoder_hidden_states=text_embeddings
             )[0]
 
             # perform guidance
@@ -382,7 +388,7 @@ def __call__(
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
             # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            latents = self.scheduler.step(noise_pred, t, torch.from_numpy(latents), **extra_step_kwargs).prev_sample
             latents = latents.numpy()
 
             # call the callback, if provided
@@ -403,7 +409,7 @@ def __call__(
             safety_checker_input = self.feature_extractor(
                 self.numpy_to_pil(image), return_tensors="np"
             ).pixel_values.astype(image.dtype)
-            # There will throw an error if use safety_checker batchsize>1
+            # safety_checker does not support batched inputs yet
             images, has_nsfw_concept = [], []
             for i in range(image.shape[0]):
                 image_i, has_nsfw_concept_i = self.safety_checker(
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index 3fa477e7dce8..a00e1f4dcd4c 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -67,6 +67,7 @@
 WEIGHTS_NAME = "diffusion_pytorch_model.bin"
 FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack"
 ONNX_WEIGHTS_NAME = "model.onnx"
+ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb"
 HUGGINGFACE_CO_RESOLVE_ENDPOINT = "https://huggingface.co"
 DIFFUSERS_CACHE = default_cache_path
 DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules"
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
index d8356675e9b3..a1946e39f912 100644
--- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
@@ -13,11 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import tempfile
 import unittest
 
 import numpy as np
 
-from diffusers import OnnxStableDiffusionPipeline
+from diffusers import DDIMScheduler, LMSDiscreteScheduler, OnnxStableDiffusionPipeline
 from diffusers.utils.testing_utils import is_onnx_available, require_onnxruntime, require_torch_gpu, slow
 
 from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin
@@ -36,32 +37,87 @@ class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.Tes
 @require_onnxruntime
 @require_torch_gpu
 class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase):
-    def test_inference(self):
-        provider = (
+    @property
+    def gpu_provider(self):
+        return (
             "CUDAExecutionProvider",
             {
-                "gpu_mem_limit": "17179869184",  # 16GB.
+                "gpu_mem_limit": "15000000000",  # 15GB
                 "arena_extend_strategy": "kSameAsRequested",
             },
         )
+
+    @property
+    def gpu_options(self):
         options = ort.SessionOptions()
         options.enable_mem_pattern = False
+        return options
+
+    def test_inference_default_pndm(self):
+        # using the PNDM scheduler by default
         sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4",
             revision="onnx",
-            provider=provider,
-            sess_options=options,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
         )
+        sd_pipe.set_progress_bar_config(disable=None)
 
         prompt = "A painting of a squirrel eating a burger"
         np.random.seed(0)
-        output = sd_pipe([prompt], guidance_scale=6.0, num_inference_steps=5, output_type="np")
+        output = sd_pipe([prompt], guidance_scale=6.0, num_inference_steps=10, output_type="np")
         image = output.images
 
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.3602, 0.3688, 0.3652, 0.3895, 0.3782, 0.3747, 0.3927, 0.4241, 0.4327])
+        expected_slice = np.array([0.0452, 0.0390, 0.0087, 0.0350, 0.0617, 0.0364, 0.0544, 0.0523, 0.0720])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_inference_ddim(self):
+        ddim_scheduler = DDIMScheduler.from_config(
+            "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx"
+        )
+        sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5",
+            revision="onnx",
+            scheduler=ddim_scheduler,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "open neural network exchange"
+        generator = np.random.RandomState(0)
+        output = sd_pipe([prompt], guidance_scale=7.5, num_inference_steps=10, generator=generator, output_type="np")
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2867, 0.1974, 0.1481, 0.7294, 0.7251, 0.6667, 0.4194, 0.5642, 0.6486])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_inference_k_lms(self):
+        lms_scheduler = LMSDiscreteScheduler.from_config(
+            "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx"
+        )
+        sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5",
+            revision="onnx",
+            scheduler=lms_scheduler,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "open neural network exchange"
+        generator = np.random.RandomState(0)
+        output = sd_pipe([prompt], guidance_scale=7.5, num_inference_steps=10, generator=generator, output_type="np")
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2306, 0.1959, 0.1593, 0.6549, 0.6394, 0.5408, 0.5065, 0.6010, 0.6161])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
     def test_intermediate_state(self):
@@ -75,27 +131,61 @@ def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None:
                 assert latents.shape == (1, 4, 64, 64)
                 latents_slice = latents[0, -3:, -3:, -1]
                 expected_slice = np.array(
-                    [-0.5950, -0.3039, -1.1672, 0.1594, -1.1572, 0.6719, -1.9712, -0.0403, 0.9592]
+                    [-0.6772, -0.3835, -1.2456, 0.1905, -1.0974, 0.6967, -1.9353, 0.0178, 1.0167]
                 )
                 assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
             elif step == 5:
                 assert latents.shape == (1, 4, 64, 64)
                 latents_slice = latents[0, -3:, -3:, -1]
                 expected_slice = np.array(
-                    [-0.4776, -0.0119, -0.8519, -0.0275, -0.9764, 0.9820, -0.3843, 0.3788, 1.2264]
+                    [-0.3351, 0.2241, -0.1837, -0.2325, -0.6577, 0.3393, -0.0241, 0.5899, 1.3875]
                 )
                 assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
 
         test_callback_fn.has_been_called = False
 
         pipe = OnnxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="onnx", provider="CUDAExecutionProvider"
+            "runwayml/stable-diffusion-v1-5",
+            revision="onnx",
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
         )
         pipe.set_progress_bar_config(disable=None)
 
         prompt = "Andromeda galaxy in a bottle"
 
-        np.random.seed(0)
-        pipe(prompt=prompt, num_inference_steps=5, guidance_scale=7.5, callback=test_callback_fn, callback_steps=1)
+        generator = np.random.RandomState(0)
+        pipe(
+            prompt=prompt,
+            num_inference_steps=5,
+            guidance_scale=7.5,
+            generator=generator,
+            callback=test_callback_fn,
+            callback_steps=1,
+        )
         assert test_callback_fn.has_been_called
         assert number_of_steps == 6
+
+    def test_stable_diffusion_no_safety_checker(self):
+        pipe = OnnxStableDiffusionPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5",
+            revision="onnx",
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+            safety_checker=None,
+        )
+        assert isinstance(pipe, OnnxStableDiffusionPipeline)
+        assert pipe.safety_checker is None
+
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+        # check that there's no error when saving a pipeline with one of the models being None
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = OnnxStableDiffusionPipeline.from_pretrained(tmpdirname)
+
+        # sanity check that the pipeline still works
+        assert pipe.safety_checker is None
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
index 3ffbfc3d4f18..61831c64c0ae 100644
--- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 
-from diffusers import OnnxStableDiffusionImg2ImgPipeline
+from diffusers import LMSDiscreteScheduler, OnnxStableDiffusionImg2ImgPipeline
 from diffusers.utils.testing_utils import is_onnx_available, load_image, require_onnxruntime, require_torch_gpu, slow
 
 from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin
@@ -35,45 +35,92 @@ class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.Tes
 @slow
 @require_onnxruntime
 @require_torch_gpu
-class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase):
-    def test_inference(self):
-        init_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/img2img/sketch-mountains-input.jpg"
-        )
-        init_image = init_image.resize((768, 512))
-        provider = (
+class OnnxStableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
+    @property
+    def gpu_provider(self):
+        return (
             "CUDAExecutionProvider",
             {
-                "gpu_mem_limit": "17179869184",  # 16GB.
+                "gpu_mem_limit": "15000000000",  # 15GB
                 "arena_extend_strategy": "kSameAsRequested",
             },
         )
+
+    @property
+    def gpu_options(self):
         options = ort.SessionOptions()
         options.enable_mem_pattern = False
+        return options
+
+    def test_inference_default_pndm(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((768, 512))
+        # using the PNDM scheduler by default
         pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4",
             revision="onnx",
-            provider=provider,
-            sess_options=options,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = np.random.RandomState(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            num_inference_steps=10,
+            generator=generator,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 383:386, -1]
+
+        assert images.shape == (1, 512, 768, 3)
+        expected_slice = np.array([0.4909, 0.5059, 0.5372, 0.4623, 0.4876, 0.5049, 0.4820, 0.4956, 0.5019])
+        # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
+
+    def test_inference_k_lms(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((768, 512))
+        lms_scheduler = LMSDiscreteScheduler.from_config(
+            "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx"
+        )
+        pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5",
+            revision="onnx",
+            scheduler=lms_scheduler,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
         )
         pipe.set_progress_bar_config(disable=None)
 
         prompt = "A fantasy landscape, trending on artstation"
 
-        np.random.seed(0)
+        generator = np.random.RandomState(0)
         output = pipe(
             prompt=prompt,
             init_image=init_image,
             strength=0.75,
             guidance_scale=7.5,
-            num_inference_steps=8,
+            num_inference_steps=10,
+            generator=generator,
             output_type="np",
         )
         images = output.images
         image_slice = images[0, 255:258, 383:386, -1]
 
         assert images.shape == (1, 512, 768, 3)
-        expected_slice = np.array([0.4830, 0.5242, 0.5603, 0.5016, 0.5131, 0.5111, 0.4928, 0.5025, 0.5055])
+        expected_slice = np.array([0.7950, 0.7923, 0.7903, 0.5516, 0.5501, 0.5476, 0.4965, 0.4933, 0.4910])
         # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues
         assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
index 81cbed4e510d..4ba8e273b497 100644
--- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 
-from diffusers import OnnxStableDiffusionInpaintPipeline
+from diffusers import LMSDiscreteScheduler, OnnxStableDiffusionInpaintPipeline
 from diffusers.utils.testing_utils import is_onnx_available, load_image, require_onnxruntime, require_torch_gpu, slow
 
 from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin
@@ -35,8 +35,24 @@ class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.Tes
 @slow
 @require_onnxruntime
 @require_torch_gpu
-class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase):
-    def test_stable_diffusion_inpaint_onnx(self):
+class OnnxStableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
+    @property
+    def gpu_provider(self):
+        return (
+            "CUDAExecutionProvider",
+            {
+                "gpu_mem_limit": "15000000000",  # 15GB
+                "arena_extend_strategy": "kSameAsRequested",
+            },
+        )
+
+    @property
+    def gpu_options(self):
+        options = ort.SessionOptions()
+        options.enable_mem_pattern = False
+        return options
+
+    def test_inference_default_pndm(self):
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo.png"
@@ -45,37 +61,69 @@ def test_stable_diffusion_inpaint_onnx(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
         )
-        provider = (
-            "CUDAExecutionProvider",
-            {
-                "gpu_mem_limit": "17179869184",  # 16GB.
-                "arena_extend_strategy": "kSameAsRequested",
-            },
+        pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
+            "runwayml/stable-diffusion-inpainting",
+            revision="onnx",
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A red cat sitting on a park bench"
+
+        generator = np.random.RandomState(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            guidance_scale=7.5,
+            num_inference_steps=10,
+            generator=generator,
+            output_type="np",
+        )
+        images = output.images
+        image_slice = images[0, 255:258, 255:258, -1]
+
+        assert images.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.2514, 0.3007, 0.3517, 0.1790, 0.2382, 0.3167, 0.1944, 0.2273, 0.2464])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_inference_k_lms(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        lms_scheduler = LMSDiscreteScheduler.from_config(
+            "runwayml/stable-diffusion-inpainting", subfolder="scheduler", revision="onnx"
         )
-        options = ort.SessionOptions()
-        options.enable_mem_pattern = False
         pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
             "runwayml/stable-diffusion-inpainting",
             revision="onnx",
-            provider=provider,
-            sess_options=options,
+            scheduler=lms_scheduler,
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
         )
         pipe.set_progress_bar_config(disable=None)
 
         prompt = "A red cat sitting on a park bench"
 
-        np.random.seed(0)
+        generator = np.random.RandomState(0)
         output = pipe(
             prompt=prompt,
             image=init_image,
             mask_image=mask_image,
             guidance_scale=7.5,
-            num_inference_steps=8,
+            num_inference_steps=10,
+            generator=generator,
             output_type="np",
         )
         images = output.images
         image_slice = images[0, 255:258, 255:258, -1]
 
         assert images.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.2951, 0.2955, 0.2922, 0.2036, 0.1977, 0.2279, 0.1716, 0.1641, 0.1799])
+        expected_slice = np.array([0.2520, 0.2743, 0.2643, 0.2641, 0.2517, 0.2650, 0.2498, 0.2688, 0.2529])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

From 614c182f94279d50d584d1a60f2745748ec598b7 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Tue, 8 Nov 2022 15:08:35 +0100
Subject: [PATCH 154/269] Restore compatibility with deprecated
 `StableDiffusionOnnxPipeline` (#1191)

* Restore compatibility with old ONNX pipeline.

I think it broke in #552.

* Add missing attribute `vae_encoder`
---
 .../stable_diffusion/pipeline_onnx_stable_diffusion.py         | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index 0c50e424e249..ecc0b31f3456 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -18,6 +18,7 @@
 
 
 class OnnxStableDiffusionPipeline(DiffusionPipeline):
+    vae_encoder: OnnxRuntimeModel
     vae_decoder: OnnxRuntimeModel
     text_encoder: OnnxRuntimeModel
     tokenizer: CLIPTokenizer
@@ -268,6 +269,7 @@ def __call__(
 class StableDiffusionOnnxPipeline(OnnxStableDiffusionPipeline):
     def __init__(
         self,
+        vae_encoder: OnnxRuntimeModel,
         vae_decoder: OnnxRuntimeModel,
         text_encoder: OnnxRuntimeModel,
         tokenizer: CLIPTokenizer,
@@ -279,6 +281,7 @@ def __init__(
         deprecation_message = "Please use `OnnxStableDiffusionPipeline` instead of `StableDiffusionOnnxPipeline`."
         deprecate("StableDiffusionOnnxPipeline", "1.0.0", deprecation_message)
         super().__init__(
+            vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
             text_encoder=text_encoder,
             tokenizer=tokenizer,

From 32b0736d8ad7ec124affca3a00a266f5addcbd91 Mon Sep 17 00:00:00 2001
From: Mishig <mishig.davaadorj@coloradocollege.edu>
Date: Tue, 8 Nov 2022 16:38:09 +0100
Subject: [PATCH 155/269] Update pr docs actions (#1194)

---
 .github/workflows/build_pr_documentation.yml | 5 ++++-
 .github/workflows/delete_doc_comment.yml     | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index d51623e735c5..542920d7f6ed 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -9,8 +9,11 @@ concurrency:
 
 jobs:
   build:
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@use_hf_hub
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: diffusers
+    secrets:
+      token: ${{ secrets.HF_DOC_PUSH }}
+      comment_bot_token: ${{ secrets.HUGGINGFACE_PUSH }}
diff --git a/.github/workflows/delete_doc_comment.yml b/.github/workflows/delete_doc_comment.yml
index 238dc0bdbabf..e1b2da9567ed 100644
--- a/.github/workflows/delete_doc_comment.yml
+++ b/.github/workflows/delete_doc_comment.yml
@@ -7,7 +7,10 @@ on:
 
 jobs:
   delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
+    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@use_hf_hub
     with:
       pr_number: ${{ github.event.number }}
       package: diffusers
+    secrets:
+      token: ${{ secrets.HF_DOC_PUSH }}
+      comment_bot_token: ${{ secrets.HUGGINGFACE_PUSH }}

From 5786b0e2f7a76229781e40671abee5000a315e88 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Tue, 8 Nov 2022 17:15:23 +0100
Subject: [PATCH 156/269] handle dtype xformers attention  (#1196)

handle dtype xformers
---
 src/diffusers/models/attention.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index bac85e2f39cf..e8ea37970e04 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -492,6 +492,8 @@ def forward(self, hidden_states, context=None, mask=None):
         # attention, what we cannot get enough of
         if self._use_memory_efficient_attention_xformers:
             hidden_states = self._memory_efficient_attention_xformers(query, key, value)
+            # Some versions of xformers return output in fp32, cast it back to the dtype of the input
+            hidden_states = hidden_states.to(query.dtype)
         else:
             if self._slice_size is None or query.shape[0] // self._slice_size == 1:
                 hidden_states = self._attention(query, key, value)

From 249d9bc0e76e55eb16c08d0e70b2d6057259c4a0 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 8 Nov 2022 18:08:08 +0100
Subject: [PATCH 157/269] [Scheduler] Move predict epsilon to init (#1155)

* [Scheduler] Move predict epsilon to init

* up

* uP

* uP

* Apply suggestions from code review

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* up

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 .../train_unconditional.py                    | 26 ++++---
 src/diffusers/configuration_utils.py          |  5 ++
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 13 +++-
 src/diffusers/schedulers/scheduling_ddpm.py   | 23 ++++--
 .../schedulers/scheduling_ddpm_flax.py        | 22 ++++--
 tests/fixtures/custom_pipeline/pipeline.py    |  3 +-
 tests/pipelines/ddpm/test_ddpm.py             | 71 ++++++++++++++++++-
 tests/test_config.py                          | 24 +++++++
 tests/test_pipelines.py                       |  1 +
 tests/test_scheduler.py                       | 30 +++++++-
 10 files changed, 193 insertions(+), 25 deletions(-)
 mode change 100755 => 100644 tests/test_config.py

diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py
index 3f9ffb11ef45..0eadecbd3095 100644
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -1,4 +1,5 @@
 import argparse
+import inspect
 import math
 import os
 from pathlib import Path
@@ -190,10 +191,10 @@ def parse_args():
     )
 
     parser.add_argument(
-        "--predict_mode",
-        type=str,
-        default="eps",
-        help="What the model should predict. 'eps' to predict error, 'x0' to directly predict reconstruction",
+        "--predict_epsilon",
+        action="store_true",
+        default=True,
+        help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.",
     )
 
     parser.add_argument("--ddpm_num_steps", type=int, default=1000)
@@ -252,7 +253,17 @@ def main(args):
             "UpBlock2D",
         ),
     )
-    noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule)
+    accepts_predict_epsilon = "predict_epsilon" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys())
+
+    if accepts_predict_epsilon:
+        noise_scheduler = DDPMScheduler(
+            num_train_timesteps=args.ddpm_num_steps,
+            beta_schedule=args.ddpm_beta_schedule,
+            predict_epsilon=args.predict_epsilon,
+        )
+    else:
+        noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule)
+
     optimizer = torch.optim.AdamW(
         model.parameters(),
         lr=args.learning_rate,
@@ -351,9 +362,9 @@ def transforms(examples):
                 # Predict the noise residual
                 model_output = model(noisy_images, timesteps).sample
 
-                if args.predict_mode == "eps":
+                if args.predict_epsilon:
                     loss = F.mse_loss(model_output, noise)  # this could have different weights!
-                elif args.predict_mode == "x0":
+                else:
                     alpha_t = _extract_into_tensor(
                         noise_scheduler.alphas_cumprod, timesteps, (clean_images.shape[0], 1, 1, 1)
                     )
@@ -401,7 +412,6 @@ def transforms(examples):
                     generator=generator,
                     batch_size=args.eval_batch_size,
                     output_type="numpy",
-                    predict_epsilon=args.predict_mode == "eps",
                 ).images
 
                 # denormalize the images and save to tensorboard
diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index 39762e41d344..fc6ac9b5b97e 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -334,6 +334,11 @@ def extract_init_dict(cls, config_dict, **kwargs):
         # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments
         init_dict = {}
         for key in expected_keys:
+            # if config param is passed to kwarg and is present in config dict
+            # it should overwrite existing config dict key
+            if key in kwargs and key in config_dict:
+                config_dict[key] = kwargs.pop(key)
+
             if key in kwargs:
                 # overwrite key
                 init_dict[key] = kwargs.pop(key)
diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index 811614ecbdde..37d12f2f5df6 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -18,7 +18,9 @@
 
 import torch
 
+from ...configuration_utils import FrozenDict
 from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...utils import deprecate
 
 
 class DDPMPipeline(DiffusionPipeline):
@@ -45,7 +47,6 @@ def __call__(
         num_inference_steps: int = 1000,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        predict_epsilon: bool = True,
         **kwargs,
     ) -> Union[ImagePipelineOutput, Tuple]:
         r"""
@@ -69,6 +70,16 @@ def __call__(
             `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
             generated images.
         """
+        message = (
+            "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
+            " DDPMScheduler.from_config(<model_id>, predict_epsilon=True)`."
+        )
+        predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+
+        if predict_epsilon is not None:
+            new_config = dict(self.scheduler.config)
+            new_config["predict_epsilon"] = predict_epsilon
+            self.scheduler._internal_dict = FrozenDict(new_config)
 
         # Sample gaussian noise to begin loop
         image = torch.randn(
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 171c9598eba2..08a73119e5b7 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -21,8 +21,8 @@
 import numpy as np
 import torch
 
-from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput
+from ..configuration_utils import ConfigMixin, FrozenDict, register_to_config
+from ..utils import BaseOutput, deprecate
 from .scheduling_utils import SchedulerMixin
 
 
@@ -99,6 +99,8 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
             `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
         clip_sample (`bool`, default `True`):
             option to clip predicted sample between -1 and 1 for numerical stability.
+        predict_epsilon (`bool`):
+            optional flag to use when the model predicts the noise (epsilon), or the samples instead of the noise.
 
     """
 
@@ -121,6 +123,7 @@ def __init__(
         trained_betas: Optional[np.ndarray] = None,
         variance_type: str = "fixed_small",
         clip_sample: bool = True,
+        predict_epsilon: bool = True,
     ):
         if trained_betas is not None:
             self.betas = torch.from_numpy(trained_betas)
@@ -221,9 +224,9 @@ def step(
         model_output: torch.FloatTensor,
         timestep: int,
         sample: torch.FloatTensor,
-        predict_epsilon=True,
         generator=None,
         return_dict: bool = True,
+        **kwargs,
     ) -> Union[DDPMSchedulerOutput, Tuple]:
         """
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -234,8 +237,6 @@ def step(
             timestep (`int`): current discrete timestep in the diffusion chain.
             sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
-            predict_epsilon (`bool`):
-                optional flag to use when model predicts the samples directly instead of the noise, epsilon.
             generator: random number generator.
             return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class
 
@@ -245,6 +246,16 @@ def step(
             returning a tuple, the first element is the sample tensor.
 
         """
+        message = (
+            "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
+            " DDPMScheduler.from_config(<model_id>, predict_epsilon=True)`."
+        )
+        predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+        if predict_epsilon is not None and predict_epsilon != self.config.predict_epsilon:
+            new_config = dict(self.config)
+            new_config["predict_epsilon"] = predict_epsilon
+            self._internal_dict = FrozenDict(new_config)
+
         t = timestep
 
         if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
@@ -260,7 +271,7 @@ def step(
 
         # 2. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-        if predict_epsilon:
+        if self.config.predict_epsilon:
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
         else:
             pred_original_sample = model_output
diff --git a/src/diffusers/schedulers/scheduling_ddpm_flax.py b/src/diffusers/schedulers/scheduling_ddpm_flax.py
index 7220a0145471..f1b04a04176e 100644
--- a/src/diffusers/schedulers/scheduling_ddpm_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddpm_flax.py
@@ -22,7 +22,8 @@
 import jax.numpy as jnp
 from jax import random
 
-from ..configuration_utils import ConfigMixin, register_to_config
+from ..configuration_utils import ConfigMixin, FrozenDict, register_to_config
+from ..utils import deprecate
 from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
 
 
@@ -97,7 +98,8 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
             `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
         clip_sample (`bool`, default `True`):
             option to clip predicted sample between -1 and 1 for numerical stability.
-        tensor_format (`str`): whether the scheduler expects pytorch or numpy arrays.
+        predict_epsilon (`bool`):
+            optional flag to use when the model predicts the noise (epsilon), or the samples instead of the noise.
 
     """
 
@@ -115,6 +117,7 @@ def __init__(
         trained_betas: Optional[jnp.ndarray] = None,
         variance_type: str = "fixed_small",
         clip_sample: bool = True,
+        predict_epsilon: bool = True,
     ):
         if trained_betas is not None:
             self.betas = jnp.asarray(trained_betas)
@@ -196,6 +199,7 @@ def step(
         key: random.KeyArray,
         predict_epsilon: bool = True,
         return_dict: bool = True,
+        **kwargs,
     ) -> Union[FlaxDDPMSchedulerOutput, Tuple]:
         """
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -208,8 +212,6 @@ def step(
             sample (`jnp.ndarray`):
                 current instance of sample being created by diffusion process.
             key (`random.KeyArray`): a PRNG key.
-            predict_epsilon (`bool`):
-                optional flag to use when model predicts the samples directly instead of the noise, epsilon.
             return_dict (`bool`): option for returning tuple rather than FlaxDDPMSchedulerOutput class
 
         Returns:
@@ -217,6 +219,16 @@ def step(
             `tuple`. When returning a tuple, the first element is the sample tensor.
 
         """
+        message = (
+            "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
+            " DDPMScheduler.from_config(<model_id>, predict_epsilon=True)`."
+        )
+        predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+        if predict_epsilon is not None and predict_epsilon != self.config.predict_epsilon:
+            new_config = dict(self.config)
+            new_config["predict_epsilon"] = predict_epsilon
+            self._internal_dict = FrozenDict(new_config)
+
         t = timestep
 
         if model_output.shape[1] == sample.shape[1] * 2 and self.config.variance_type in ["learned", "learned_range"]:
@@ -232,7 +244,7 @@ def step(
 
         # 2. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-        if predict_epsilon:
+        if self.config.predict_epsilon:
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
         else:
             pred_original_sample = model_output
diff --git a/tests/fixtures/custom_pipeline/pipeline.py b/tests/fixtures/custom_pipeline/pipeline.py
index 10a22edaa490..e7429d0a1945 100644
--- a/tests/fixtures/custom_pipeline/pipeline.py
+++ b/tests/fixtures/custom_pipeline/pipeline.py
@@ -42,7 +42,6 @@ def __call__(
         self,
         batch_size: int = 1,
         generator: Optional[torch.Generator] = None,
-        eta: float = 0.0,
         num_inference_steps: int = 50,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
@@ -89,7 +88,7 @@ def __call__(
             # 2. predict previous mean of image x_t-1 and add variance depending on eta
             # eta corresponds to η in paper and should be between [0, 1]
             # do x_t -> x_t-1
-            image = self.scheduler.step(model_output, t, image, eta).prev_sample
+            image = self.scheduler.step(model_output, t, image).prev_sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
index c58e2db38f21..a09f77d12467 100644
--- a/tests/pipelines/ddpm/test_ddpm.py
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -19,6 +19,7 @@
 import torch
 
 from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
+from diffusers.utils import deprecate
 from diffusers.utils.testing_utils import require_torch, slow, torch_device
 
 from ...test_pipelines_common import PipelineTesterMixin
@@ -28,8 +29,74 @@
 
 
 class DDPMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    # FIXME: add fast tests
-    pass
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=3,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    def test_inference(self):
+        unet = self.dummy_uncond_unet
+        scheduler = DDPMScheduler()
+
+        ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        # Warmup pass when using mps (see #372)
+        if torch_device == "mps":
+            _ = ddpm(num_inference_steps=1)
+
+        generator = torch.manual_seed(0)
+        image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
+
+        generator = torch.manual_seed(0)
+        image_from_tuple = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array(
+            [5.589e-01, 7.089e-01, 2.632e-01, 6.841e-01, 1.000e-04, 9.999e-01, 1.973e-01, 1.000e-04, 8.010e-02]
+        )
+        tolerance = 1e-2 if torch_device != "mps" else 3e-2
+        assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance
+
+    def test_inference_predict_epsilon(self):
+        deprecate("remove this test", "0.10.0", "remove")
+        unet = self.dummy_uncond_unet
+        scheduler = DDPMScheduler(predict_epsilon=False)
+
+        ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        # Warmup pass when using mps (see #372)
+        if torch_device == "mps":
+            _ = ddpm(num_inference_steps=1)
+
+        generator = torch.manual_seed(0)
+        image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
+
+        generator = torch.manual_seed(0)
+        image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", predict_epsilon=False)[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_eps_slice = image_eps[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        tolerance = 1e-2 if torch_device != "mps" else 3e-2
+        assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance
 
 
 @slow
diff --git a/tests/test_config.py b/tests/test_config.py
old mode 100755
new mode 100644
index 5084769def41..8ae8e1d9e173
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -21,6 +21,7 @@
 import diffusers
 from diffusers import (
     DDIMScheduler,
+    DDPMScheduler,
     DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,
     EulerDiscreteScheduler,
@@ -291,6 +292,29 @@ def test_load_pndm(self):
         # no warning should be thrown
         assert cap_logger.out == ""
 
+    def test_overwrite_config_on_load(self):
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with CaptureLogger(logger) as cap_logger:
+            ddpm = DDPMScheduler.from_config(
+                "hf-internal-testing/tiny-stable-diffusion-torch",
+                subfolder="scheduler",
+                predict_epsilon=False,
+                beta_end=8,
+            )
+
+        with CaptureLogger(logger) as cap_logger_2:
+            ddpm_2 = DDPMScheduler.from_config("google/ddpm-celebahq-256", beta_start=88)
+
+        assert ddpm.__class__ == DDPMScheduler
+        assert ddpm.config.predict_epsilon is False
+        assert ddpm.config.beta_end == 8
+        assert ddpm_2.config.beta_start == 88
+
+        # no warning should be thrown
+        assert cap_logger.out == ""
+        assert cap_logger_2.out == ""
+
     def test_load_dpmsolver(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 2b19b08b3742..db3e804e674c 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -107,6 +107,7 @@ def test_run_custom_pipeline(self):
         images, output_str = pipeline(num_inference_steps=2, output_type="np")
 
         assert images[0].shape == (1, 32, 32, 3)
+
         # compare output to https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py#L102
         assert output_str == "This is a test"
 
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 70201d1e6774..234e1185b439 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -33,7 +33,7 @@
     ScoreSdeVeScheduler,
     VQDiffusionScheduler,
 )
-from diffusers.utils import torch_device
+from diffusers.utils import deprecate, torch_device
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
@@ -393,6 +393,34 @@ def test_clip_sample(self):
         for clip_sample in [True, False]:
             self.check_over_configs(clip_sample=clip_sample)
 
+    def test_predict_epsilon(self):
+        for predict_epsilon in [True, False]:
+            self.check_over_configs(predict_epsilon=predict_epsilon)
+
+    def test_deprecated_epsilon(self):
+        deprecate("remove this test", "0.10.0", "remove")
+        scheduler_class = self.scheduler_classes[0]
+        scheduler_config = self.get_scheduler_config()
+
+        sample = self.dummy_sample_deter
+        residual = 0.1 * self.dummy_sample_deter
+        time_step = 4
+
+        scheduler = scheduler_class(**scheduler_config)
+        scheduler_eps = scheduler_class(predict_epsilon=False, **scheduler_config)
+
+        kwargs = {}
+        if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+            kwargs["generator"] = torch.Generator().manual_seed(0)
+        output = scheduler.step(residual, time_step, sample, predict_epsilon=False, **kwargs).prev_sample
+
+        kwargs = {}
+        if "generator" in set(inspect.signature(scheduler.step).parameters.keys()):
+            kwargs["generator"] = torch.Generator().manual_seed(0)
+        output_eps = scheduler_eps.step(residual, time_step, sample, predict_epsilon=False, **kwargs).prev_sample
+
+        assert (output - output_eps).abs().sum() < 1e-5
+
     def test_time_indices(self):
         for t in [0, 500, 999]:
             self.check_over_forward(time_step=t)

From 598ff76bbf55330c0ab7120c8bf3928ee5b624ef Mon Sep 17 00:00:00 2001
From: Nathan Lambert <nathan@huggingface.co>
Date: Wed, 9 Nov 2022 01:06:49 -0800
Subject: [PATCH 158/269] add licenses to pipelines (#1201)

add licenses
---
 .../dance_diffusion/pipeline_dance_diffusion.py   |  1 -
 src/diffusers/pipelines/ddim/pipeline_ddim.py     |  1 -
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py     |  1 -
 .../latent_diffusion/pipeline_latent_diffusion.py | 14 ++++++++++++++
 .../pipeline_latent_diffusion_uncond.py           | 14 ++++++++++++++
 src/diffusers/pipelines/pndm/pipeline_pndm.py     |  1 -
 .../score_sde_ve/pipeline_score_sde_ve.py         | 15 ++++++++++++++-
 .../stable_diffusion/pipeline_cycle_diffusion.py  | 14 ++++++++++++++
 .../pipeline_flax_stable_diffusion.py             | 14 ++++++++++++++
 .../pipeline_onnx_stable_diffusion.py             | 14 ++++++++++++++
 .../pipeline_onnx_stable_diffusion_img2img.py     | 14 ++++++++++++++
 .../pipeline_onnx_stable_diffusion_inpaint.py     | 14 ++++++++++++++
 .../stable_diffusion/pipeline_stable_diffusion.py | 14 ++++++++++++++
 .../pipeline_stable_diffusion_img2img.py          | 14 ++++++++++++++
 .../pipeline_stable_diffusion_inpaint.py          | 14 ++++++++++++++
 .../pipeline_stable_diffusion_inpaint_legacy.py   | 14 ++++++++++++++
 .../pipelines/stable_diffusion/safety_checker.py  | 14 ++++++++++++++
 .../stable_diffusion/safety_checker_flax.py       | 14 ++++++++++++++
 .../pipeline_stochastic_karras_ve.py              | 15 ++++++++++++++-
 19 files changed, 210 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
index 86f01ec58616..48d16889a030 100644
--- a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
+++ b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
@@ -10,7 +10,6 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-
 # limitations under the License.
 
 
diff --git a/src/diffusers/pipelines/ddim/pipeline_ddim.py b/src/diffusers/pipelines/ddim/pipeline_ddim.py
index 733a28c9f388..aab6e68613c4 100644
--- a/src/diffusers/pipelines/ddim/pipeline_ddim.py
+++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py
@@ -10,7 +10,6 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-
 # limitations under the License.
 
 import inspect
diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index 37d12f2f5df6..3665c68efe1d 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -10,7 +10,6 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-
 # limitations under the License.
 
 
diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
index cfa39949130e..feb5b00d7438 100644
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from typing import List, Optional, Tuple, Union
 
diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
index c8da6f193e15..5345c4e5625e 100644
--- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
+++ b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from typing import Optional, Tuple, Union
 
diff --git a/src/diffusers/pipelines/pndm/pipeline_pndm.py b/src/diffusers/pipelines/pndm/pipeline_pndm.py
index f360da09ac94..ef7062dea19c 100644
--- a/src/diffusers/pipelines/pndm/pipeline_pndm.py
+++ b/src/diffusers/pipelines/pndm/pipeline_pndm.py
@@ -10,7 +10,6 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-
 # limitations under the License.
 
 
diff --git a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
index 7f63820eec28..7eb6a5d3cbd4 100644
--- a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
+++ b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
@@ -1,4 +1,17 @@
-#!/usr/bin/env python3
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Optional, Tuple, Union
 
 import torch
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 3d2ec7d55b98..e6c8372c05a0 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from typing import Callable, List, Optional, Union
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index 73ec322c6646..02943997d928 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import warnings
 from functools import partial
 from typing import Dict, List, Optional, Union
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index ecc0b31f3456..3b4689086f27 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from typing import Callable, List, Optional, Union
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index f85069b969b7..f167acf23357 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from typing import Callable, List, Optional, Union
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index f763291c77f0..3aa1cc829956 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from typing import Callable, List, Optional, Union
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 30be4156f92d..23b4673563cf 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from typing import Callable, List, Optional, Union
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index f6f38ab1d376..324878b5ae6c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from typing import Callable, List, Optional, Union
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 83d27926dc5b..d762dde55bf6 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from typing import Callable, List, Optional, Union
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index a92e23bfc18f..80d087952640 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 from typing import Callable, List, Optional, Union
 
diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/src/diffusers/pipelines/stable_diffusion/safety_checker.py
index 09d7a3bbf95a..0477c983ead4 100644
--- a/src/diffusers/pipelines/stable_diffusion/safety_checker.py
+++ b/src/diffusers/pipelines/stable_diffusion/safety_checker.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 import torch
 import torch.nn as nn
diff --git a/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py b/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
index e4ea381a8c9b..e1f669d22b76 100644
--- a/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
+++ b/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Optional, Tuple
 
 import jax
diff --git a/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
index 9e8864b4ca76..739de8ebe620 100644
--- a/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
+++ b/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
@@ -1,4 +1,17 @@
-#!/usr/bin/env python3
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Optional, Tuple, Union
 
 import torch

From 24895a1f494062d73028e31880c8848c6a674750 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Wed, 9 Nov 2022 10:28:10 +0100
Subject: [PATCH 159/269] Fix cpu offloading (#1177)

* Fix cpu offloading

* get offloaded devices locally for SD pipelines
---
 src/diffusers/pipeline_utils.py               |  2 -
 .../pipeline_stable_diffusion.py              | 40 +++++++++++-----
 .../pipeline_stable_diffusion_img2img.py      | 37 +++++++++++----
 .../pipeline_stable_diffusion_inpaint.py      | 47 +++++++++++++------
 .../stable_diffusion/test_stable_diffusion.py | 10 ++--
 .../test_stable_diffusion_img2img.py          | 15 ++----
 .../test_stable_diffusion_inpaint.py          | 17 +++----
 7 files changed, 107 insertions(+), 61 deletions(-)

diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 4e7aeb8c9157..5fa120fe2d0b 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -230,8 +230,6 @@ def device(self) -> torch.device:
         for name in module_names.keys():
             module = getattr(self, name)
             if isinstance(module, torch.nn.Module):
-                if module.device == torch.device("meta"):
-                    return torch.device("cpu")
                 return module.device
         return torch.device("cpu")
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 23b4673563cf..c64db5266642 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -195,6 +195,24 @@ def enable_sequential_cpu_offload(self):
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
     @torch.no_grad()
     def __call__(
         self,
@@ -286,6 +304,8 @@ def __call__(
                 f" {type(callback_steps)}."
             )
 
+        device = self._execution_device
+
         # get prompt text embeddings
         text_inputs = self.tokenizer(
             prompt,
@@ -302,7 +322,7 @@ def __call__(
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed, seq_len, _ = text_embeddings.shape
@@ -342,7 +362,7 @@ def __call__(
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
@@ -362,20 +382,18 @@ def __call__(
         latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
-            if self.device.type == "mps":
+            if device.type == "mps":
                 # randn does not work reproducibly on mps
-                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
-                    self.device
-                )
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(device)
             else:
-                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+                latents = torch.randn(latents_shape, generator=generator, device=device, dtype=latents_dtype)
         else:
             if latents.shape != latents_shape:
                 raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-            latents = latents.to(self.device)
+            latents = latents.to(device)
 
         # set timesteps and move to the correct device
-        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps_tensor = self.scheduler.timesteps
 
         # scale the initial noise by the standard deviation required by the scheduler
@@ -424,9 +442,7 @@ def __call__(
         image = image.cpu().permute(0, 2, 3, 1).float().numpy()
 
         if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
-                self.device
-            )
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
                 images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
             )
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 324878b5ae6c..f05819b0d914 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -183,6 +183,25 @@ def enable_sequential_cpu_offload(self):
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
     def enable_xformers_memory_efficient_attention(self):
         r"""
         Enable memory efficient attention as implemented in xformers.
@@ -292,6 +311,8 @@ def __call__(
                 f" {type(callback_steps)}."
             )
 
+        device = self._execution_device
+
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
 
@@ -314,7 +335,7 @@ def __call__(
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
         # duplicate text embeddings for each generation per prompt
         text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
@@ -348,7 +369,7 @@ def __call__(
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
 
             # duplicate unconditional embeddings for each generation per prompt
             seq_len = uncond_embeddings.shape[1]
@@ -362,7 +383,7 @@ def __call__(
 
         # encode the init image into latents and scale the latents
         latents_dtype = text_embeddings.dtype
-        init_image = init_image.to(device=self.device, dtype=latents_dtype)
+        init_image = init_image.to(device=device, dtype=latents_dtype)
         init_latent_dist = self.vae.encode(init_image).latent_dist
         init_latents = init_latent_dist.sample(generator=generator)
         init_latents = 0.18215 * init_latents
@@ -393,10 +414,10 @@ def __call__(
         init_timestep = min(init_timestep, num_inference_steps)
 
         timesteps = self.scheduler.timesteps[-init_timestep]
-        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
+        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=device)
 
         # add noise to latents using the timesteps
-        noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
+        noise = torch.randn(init_latents.shape, generator=generator, device=device, dtype=latents_dtype)
         init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
 
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -419,7 +440,7 @@ def __call__(
 
         # Some schedulers like PNDM have timesteps as arrays
         # It's more optimized to move all timesteps to correct device beforehand
-        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+        timesteps = self.scheduler.timesteps[t_start:].to(device)
 
         for i, t in enumerate(self.progress_bar(timesteps)):
             # expand the latents if we are doing classifier free guidance
@@ -448,9 +469,7 @@ def __call__(
         image = image.cpu().permute(0, 2, 3, 1).numpy()
 
         if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
-                self.device
-            )
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
                 images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
             )
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index d762dde55bf6..5ccdd07bb384 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -183,6 +183,25 @@ def enable_sequential_cpu_offload(self):
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
     def enable_xformers_memory_efficient_attention(self):
         r"""
         Enable memory efficient attention as implemented in xformers.
@@ -303,6 +322,8 @@ def __call__(
                 f" {type(callback_steps)}."
             )
 
+        device = self._execution_device
+
         # get prompt text embeddings
         text_inputs = self.tokenizer(
             prompt,
@@ -319,7 +340,7 @@ def __call__(
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed, seq_len, _ = text_embeddings.shape
@@ -359,7 +380,7 @@ def __call__(
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
@@ -379,17 +400,15 @@ def __call__(
         latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
-            if self.device.type == "mps":
+            if device.type == "mps":
                 # randn does not exist on mps
-                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
-                    self.device
-                )
+                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(device)
             else:
-                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+                latents = torch.randn(latents_shape, generator=generator, device=device, dtype=latents_dtype)
         else:
             if latents.shape != latents_shape:
                 raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-            latents = latents.to(self.device)
+            latents = latents.to(device)
 
         # prepare mask and masked_image
         mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
@@ -398,9 +417,9 @@ def __call__(
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
         # and half precision
         mask = torch.nn.functional.interpolate(mask, size=(height // 8, width // 8))
-        mask = mask.to(device=self.device, dtype=text_embeddings.dtype)
+        mask = mask.to(device=device, dtype=text_embeddings.dtype)
 
-        masked_image = masked_image.to(device=self.device, dtype=text_embeddings.dtype)
+        masked_image = masked_image.to(device=device, dtype=text_embeddings.dtype)
 
         # encode the mask image into latents space so we can concatenate it to the latents
         masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
@@ -416,7 +435,7 @@ def __call__(
         )
 
         # aligning device to prevent device errors when concating it with the latent model input
-        masked_image_latents = masked_image_latents.to(device=self.device, dtype=text_embeddings.dtype)
+        masked_image_latents = masked_image_latents.to(device=device, dtype=text_embeddings.dtype)
 
         num_channels_mask = mask.shape[1]
         num_channels_masked_image = masked_image_latents.shape[1]
@@ -431,7 +450,7 @@ def __call__(
             )
 
         # set timesteps and move to the correct device
-        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps_tensor = self.scheduler.timesteps
 
         # scale the initial noise by the standard deviation required by the scheduler
@@ -484,9 +503,7 @@ def __call__(
         image = image.cpu().permute(0, 2, 3, 1).float().numpy()
 
         if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
-                self.device
-            )
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
             image, has_nsfw_concept = self.safety_checker(
                 images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
             )
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index a83299eaf9f3..252b02806ae0 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -839,20 +839,22 @@ def test_stable_diffusion_low_cpu_mem_usage(self):
 
         assert 2 * low_cpu_mem_usage_time < normal_load_time
 
-    @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         torch.cuda.empty_cache()
         torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
 
         pipeline_id = "CompVis/stable-diffusion-v1-4"
         prompt = "Andromeda galaxy in a bottle"
 
         pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, revision="fp16", torch_dtype=torch.float16)
+        pipeline = pipeline.to(torch_device)
         pipeline.enable_attention_slicing(1)
         pipeline.enable_sequential_cpu_offload()
 
-        _ = pipeline(prompt, num_inference_steps=5)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        _ = pipeline(prompt, generator=generator, num_inference_steps=5)
 
         mem_bytes = torch.cuda.max_memory_allocated()
-        # make sure that less than 1.5 GB is allocated
-        assert mem_bytes < 1.5 * 10**9
+        # make sure that less than 2.8 GB is allocated
+        assert mem_bytes < 2.8 * 10**9
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 2d29e1b80644..6d5c6feab5bc 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -603,25 +603,18 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         torch.cuda.empty_cache()
         torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
 
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/img2img/sketch-mountains-input.jpg"
         )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/img2img/fantasy_landscape_k_lms.png"
-        )
         init_image = init_image.resize((768, 512))
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         model_id = "CompVis/stable-diffusion-v1-4"
         lms = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-            model_id,
-            scheduler=lms,
-            safety_checker=None,
-            device_map="auto",
+            model_id, scheduler=lms, safety_checker=None, device_map="auto", revision="fp16", torch_dtype=torch.float16
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -642,5 +635,5 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         )
 
         mem_bytes = torch.cuda.max_memory_allocated()
-        # make sure that less than 1.5 GB is allocated
-        assert mem_bytes < 1.5 * 10**9
+        # make sure that less than 2.2 GB is allocated
+        assert mem_bytes < 2.2 * 10**9
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index e8dcb43163da..5fcdd71dd6e4 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -384,6 +384,7 @@ def test_stable_diffusion_inpaint_pipeline_pndm(self):
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         torch.cuda.empty_cache()
         torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
 
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
@@ -393,16 +394,16 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
         )
-        expected_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/in_paint/yellow_cat_sitting_on_a_park_bench_pndm.png"
-        )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         model_id = "runwayml/stable-diffusion-inpainting"
         pndm = PNDMScheduler.from_config(model_id, subfolder="scheduler")
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
-            model_id, safety_checker=None, scheduler=pndm, device_map="auto"
+            model_id,
+            safety_checker=None,
+            scheduler=pndm,
+            device_map="auto",
+            revision="fp16",
+            torch_dtype=torch.float16,
         )
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -422,5 +423,5 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         )
 
         mem_bytes = torch.cuda.max_memory_allocated()
-        # make sure that less than 1.5 GB is allocated
-        assert mem_bytes < 1.5 * 10**9
+        # make sure that less than 2.2 GB is allocated
+        assert mem_bytes < 2.2 * 10**9

From 6cf72a9b1ee3125c5ccefcf031401ef8d49d8fe8 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 9 Nov 2022 11:22:12 +0100
Subject: [PATCH 160/269] Fix slow tests (#1210)

* fix tests

* Fix more

* more
---
 .../pipeline_cycle_diffusion.py               |  8 +++--
 src/diffusers/schedulers/scheduling_ddim.py   |  2 +-
 .../scheduling_euler_ancestral_discrete.py    |  2 +-
 .../schedulers/scheduling_euler_discrete.py   |  2 +-
 .../stable_diffusion/test_cycle_diffusion.py  |  8 +++--
 .../stable_diffusion/test_stable_diffusion.py |  4 +--
 tests/test_pipelines.py                       |  7 ++--
 tests/test_scheduler.py                       | 33 ++++++++++++-------
 8 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index e6c8372c05a0..631ceda813ee 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -43,7 +43,7 @@ def preprocess(image):
     return 2.0 * image - 1.0
 
 
-def posterior_sample(scheduler, latents, timestep, clean_latents, eta):
+def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta):
     # 1. get previous step value (=t-1)
     prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps
 
@@ -62,7 +62,9 @@ def posterior_sample(scheduler, latents, timestep, clean_latents, eta):
     # direction pointing to x_t
     e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5)
     dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t
-    noise = std_dev_t * torch.randn(clean_latents.shape, dtype=clean_latents.dtype, device=clean_latents.device)
+    noise = std_dev_t * torch.randn(
+        clean_latents.shape, dtype=clean_latents.dtype, device=clean_latents.device, generator=generator
+    )
     prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise
 
     return prev_latents
@@ -499,7 +501,7 @@ def __call__(
 
             # Sample source_latents from the posterior distribution.
             prev_source_latents = posterior_sample(
-                self.scheduler, source_latents, t, clean_latents, **extra_step_kwargs
+                self.scheduler, source_latents, t, clean_latents, generator=generator, **extra_step_kwargs
             )
             # Compute noise.
             noise = compute_noise(
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 1acb81764d32..75cef635d063 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -288,7 +288,7 @@ def step(
 
         if eta > 0:
             # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072
-            device = model_output.device if torch.is_tensor(model_output) else torch.device("cpu")
+            device = model_output.device
             if variance_noise is not None and generator is not None:
                 raise ValueError(
                     "Cannot pass both generator and variance_noise. Please make sure that either `generator` or"
diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
index 33505c81c004..621b5c17c0e9 100644
--- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -221,7 +221,7 @@ def step(
 
         prev_sample = sample + derivative * dt
 
-        device = model_output.device if torch.is_tensor(model_output) else torch.device("cpu")
+        device = model_output.device
         if device.type == "mps":
             # randn does not work reproducibly on mps
             noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator).to(
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index 9f707c27a15b..2f9e938474ef 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -218,7 +218,7 @@ def step(
 
         gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
 
-        device = model_output.device if torch.is_tensor(model_output) else torch.device("cpu")
+        device = model_output.device
         if device.type == "mps":
             # randn does not work reproducibly on mps
             noise = torch.randn(model_output.shape, dtype=model_output.dtype, device="cpu", generator=generator).to(
diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
index 0bddd63807d6..de918c7e5c22 100644
--- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
@@ -293,7 +293,7 @@ def test_cycle_diffusion_pipeline_fp16(self):
         source_prompt = "A black colored car"
         prompt = "A blue colored car"
 
-        torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         output = pipe(
             prompt=prompt,
             source_prompt=source_prompt,
@@ -303,12 +303,13 @@ def test_cycle_diffusion_pipeline_fp16(self):
             strength=0.85,
             guidance_scale=3,
             source_guidance_scale=1,
+            generator=generator,
             output_type="np",
         )
         image = output.images
 
         # the values aren't exactly equal, but the images look the same visually
-        assert np.abs(image - expected_image).max() < 1e-2
+        assert np.abs(image - expected_image).max() < 5e-1
 
     def test_cycle_diffusion_pipeline(self):
         init_image = load_image(
@@ -331,7 +332,7 @@ def test_cycle_diffusion_pipeline(self):
         source_prompt = "A black colored car"
         prompt = "A blue colored car"
 
-        torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         output = pipe(
             prompt=prompt,
             source_prompt=source_prompt,
@@ -341,6 +342,7 @@ def test_cycle_diffusion_pipeline(self):
             strength=0.85,
             guidance_scale=3,
             source_guidance_scale=1,
+            generator=generator,
             output_type="np",
         )
         image = output.images
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 252b02806ae0..6e1071124cb7 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -755,7 +755,7 @@ def test_stable_diffusion_text2img_pipeline_fp16(self):
 
     def test_stable_diffusion_text2img_pipeline_default(self):
         expected_image = load_numpy(
-            "https://huggingface.co/datasets/lewington/expected-images/resolve/main/astronaut_riding_a_horse.npy"
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text2img/astronaut_riding_a_horse.npy"
         )
 
         model_id = "CompVis/stable-diffusion-v1-4"
@@ -771,7 +771,7 @@ def test_stable_diffusion_text2img_pipeline_default(self):
         image = output.images[0]
 
         assert image.shape == (512, 512, 3)
-        assert np.abs(expected_image - image).max() < 1e-3
+        assert np.abs(expected_image - image).max() < 5e-3
 
     def test_stable_diffusion_text2img_intermediate_state(self):
         number_of_steps = 0
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index db3e804e674c..753c821dd315 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -442,7 +442,8 @@ def test_from_pretrained_hub_pass_model(self):
     def test_output_format(self):
         model_path = "google/ddpm-cifar10-32"
 
-        pipe = DDIMPipeline.from_pretrained(model_path)
+        scheduler = DDIMScheduler.from_config(model_path)
+        pipe = DDIMPipeline.from_pretrained(model_path, scheduler=scheduler)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -451,13 +452,13 @@ def test_output_format(self):
         assert images.shape == (1, 32, 32, 3)
         assert isinstance(images, np.ndarray)
 
-        images = pipe(generator=generator, output_type="pil").images
+        images = pipe(generator=generator, output_type="pil", num_inference_steps=4).images
         assert isinstance(images, list)
         assert len(images) == 1
         assert isinstance(images[0], PIL.Image.Image)
 
         # use PIL by default
-        images = pipe(generator=generator).images
+        images = pipe(generator=generator, num_inference_steps=4).images
         assert isinstance(images, list)
         assert isinstance(images[0], PIL.Image.Image)
 
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 234e1185b439..ab5217151125 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -1281,10 +1281,11 @@ def test_full_loop_no_noise(self):
 
         scheduler.set_timesteps(self.num_inference_steps)
 
-        generator = torch.Generator().manual_seed(0)
+        generator = torch.Generator(torch_device).manual_seed(0)
 
         model = self.dummy_model()
         sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
 
         for i, t in enumerate(scheduler.timesteps):
             sample = scheduler.scale_model_input(sample, t)
@@ -1296,7 +1297,6 @@ def test_full_loop_no_noise(self):
 
         result_sum = torch.sum(torch.abs(sample))
         result_mean = torch.mean(torch.abs(sample))
-        print(result_sum, result_mean)
 
         assert abs(result_sum.item() - 10.0807) < 1e-2
         assert abs(result_mean.item() - 0.0131) < 1e-3
@@ -1308,7 +1308,7 @@ def test_full_loop_device(self):
 
         scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
 
-        generator = torch.Generator().manual_seed(0)
+        generator = torch.Generator(torch_device).manual_seed(0)
 
         model = self.dummy_model()
         sample = self.dummy_sample_deter * scheduler.init_noise_sigma
@@ -1324,7 +1324,6 @@ def test_full_loop_device(self):
 
         result_sum = torch.sum(torch.abs(sample))
         result_mean = torch.mean(torch.abs(sample))
-        print(result_sum, result_mean)
 
         assert abs(result_sum.item() - 10.0807) < 1e-2
         assert abs(result_mean.item() - 0.0131) < 1e-3
@@ -1365,10 +1364,11 @@ def test_full_loop_no_noise(self):
 
         scheduler.set_timesteps(self.num_inference_steps)
 
-        generator = torch.Generator().manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
 
         model = self.dummy_model()
         sample = self.dummy_sample_deter * scheduler.init_noise_sigma
+        sample = sample.to(torch_device)
 
         for i, t in enumerate(scheduler.timesteps):
             sample = scheduler.scale_model_input(sample, t)
@@ -1380,9 +1380,14 @@ def test_full_loop_no_noise(self):
 
         result_sum = torch.sum(torch.abs(sample))
         result_mean = torch.mean(torch.abs(sample))
-        print(result_sum, result_mean)
-        assert abs(result_sum.item() - 152.3192) < 1e-2
-        assert abs(result_mean.item() - 0.1983) < 1e-3
+
+        if str(torch_device).startswith("cpu"):
+            assert abs(result_sum.item() - 152.3192) < 1e-2
+            assert abs(result_mean.item() - 0.1983) < 1e-3
+        else:
+            # CUDA
+            assert abs(result_sum.item() - 144.8084) < 1e-2
+            assert abs(result_mean.item() - 0.18855) < 1e-3
 
     def test_full_loop_device(self):
         scheduler_class = self.scheduler_classes[0]
@@ -1391,7 +1396,7 @@ def test_full_loop_device(self):
 
         scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
 
-        generator = torch.Generator().manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
 
         model = self.dummy_model()
         sample = self.dummy_sample_deter * scheduler.init_noise_sigma
@@ -1407,14 +1412,18 @@ def test_full_loop_device(self):
 
         result_sum = torch.sum(torch.abs(sample))
         result_mean = torch.mean(torch.abs(sample))
-        print(result_sum, result_mean)
-        if not str(torch_device).startswith("mps"):
+
+        if str(torch_device).startswith("cpu"):
             # The following sum varies between 148 and 156 on mps. Why?
             assert abs(result_sum.item() - 152.3192) < 1e-2
             assert abs(result_mean.item() - 0.1983) < 1e-3
-        else:
+        elif str(torch_device).startswith("mps"):
             # Larger tolerance on mps
             assert abs(result_mean.item() - 0.1983) < 1e-2
+        else:
+            # CUDA
+            assert abs(result_sum.item() - 144.8084) < 1e-2
+            assert abs(result_mean.item() - 0.18855) < 1e-3
 
 
 class IPNDMSchedulerTest(SchedulerCommonTest):

From 663f0c19632a245dc09cff5b5a3ea088ad33e1f2 Mon Sep 17 00:00:00 2001
From: camenduru <54370274+camenduru@users.noreply.github.com>
Date: Wed, 9 Nov 2022 13:34:15 +0300
Subject: [PATCH 161/269] =?UTF-8?q?[Flax]=20fix=20extra=20copy=20pasta=20?=
 =?UTF-8?q?=F0=9F=8D=9D=20(#1187)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/dreambooth/train_dreambooth_flax.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py
index 078a66e4acee..6606af4f17f7 100644
--- a/examples/dreambooth/train_dreambooth_flax.py
+++ b/examples/dreambooth/train_dreambooth_flax.py
@@ -327,22 +327,6 @@ def main():
     if args.seed is not None:
         set_seed(args.seed)
 
-    if jax.process_index() == 0:
-        if args.push_to_hub:
-            if args.hub_model_id is None:
-                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
-            else:
-                repo_name = args.hub_model_id
-            repo = Repository(args.output_dir, clone_from=repo_name)
-
-            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
-                if "step_*" not in gitignore:
-                    gitignore.write("step_*\n")
-                if "epoch_*" not in gitignore:
-                    gitignore.write("epoch_*\n")
-        elif args.output_dir is not None:
-            os.makedirs(args.output_dir, exist_ok=True)
-
     rng = jax.random.PRNGKey(args.seed)
 
     if args.with_prior_preservation:

From cd77a0365196a09a60bd80a4e7a85fbdb03c531f Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Wed, 9 Nov 2022 11:46:12 +0100
Subject: [PATCH 162/269] [CLIPGuidedStableDiffusion] support DDIM scheduler
 (#1190)

add ddim in clip guided
---
 .../community/clip_guided_stable_diffusion.py | 30 ++++++++++++++++---
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/examples/community/clip_guided_stable_diffusion.py b/examples/community/clip_guided_stable_diffusion.py
index 2c86e9130fdc..14d9ee63226d 100644
--- a/examples/community/clip_guided_stable_diffusion.py
+++ b/examples/community/clip_guided_stable_diffusion.py
@@ -5,7 +5,14 @@
 from torch import nn
 from torch.nn import functional as F
 
-from diffusers import AutoencoderKL, DiffusionPipeline, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DiffusionPipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+)
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
 from torchvision import transforms
 from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer
@@ -56,7 +63,7 @@ def __init__(
         clip_model: CLIPModel,
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
-        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler],
+        scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler],
         feature_extractor: CLIPFeatureExtractor,
     ):
         super().__init__()
@@ -123,7 +130,7 @@ def cond_fn(
         # predict the noise residual
         noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample
 
-        if isinstance(self.scheduler, PNDMScheduler):
+        if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler)):
             alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
             beta_prod_t = 1 - alpha_prod_t
             # compute predicted original sample from predicted noise also called
@@ -176,6 +183,7 @@ def __call__(
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
         num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
         clip_guidance_scale: Optional[float] = 100,
         clip_prompt: Optional[Union[str, List[str]]] = None,
         num_cutouts: Optional[int] = 4,
@@ -275,6 +283,20 @@ def __call__(
         # scale the initial noise by the standard deviation required by the scheduler
         latents = latents * self.scheduler.init_noise_sigma
 
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+
         for i, t in enumerate(self.progress_bar(timesteps_tensor)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
@@ -306,7 +328,7 @@ def __call__(
                 )
 
             # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
 
         # scale and decode the image latents with vae
         latents = 1 / 0.18215 * latents

From 3f7edc5f724862cce8d43bca1b531d962e963a3a Mon Sep 17 00:00:00 2001
From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com>
Date: Wed, 9 Nov 2022 18:08:30 +0700
Subject: [PATCH 163/269] Fix layer names convert LDM script (#1206)

fix script convert LDM
---
 ...rt_ldm_original_checkpoint_to_diffusers.py | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/scripts/convert_ldm_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_original_checkpoint_to_diffusers.py
index 52865792256f..f547e96f4ed5 100644
--- a/scripts/convert_ldm_original_checkpoint_to_diffusers.py
+++ b/scripts/convert_ldm_original_checkpoint_to_diffusers.py
@@ -112,9 +112,9 @@ def assign_to_checkpoint(
             continue
 
         # Global renaming happens here
-        new_path = new_path.replace("middle_block.0", "mid.resnets.0")
-        new_path = new_path.replace("middle_block.1", "mid.attentions.0")
-        new_path = new_path.replace("middle_block.2", "mid.resnets.1")
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
 
         if additional_replacements is not None:
             for replacement in additional_replacements:
@@ -175,15 +175,16 @@ def convert_ldm_checkpoint(checkpoint, config):
         attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
 
         if f"input_blocks.{i}.0.op.weight" in checkpoint:
-            new_checkpoint[f"downsample_blocks.{block_id}.downsamplers.0.conv.weight"] = checkpoint[
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = checkpoint[
                 f"input_blocks.{i}.0.op.weight"
             ]
-            new_checkpoint[f"downsample_blocks.{block_id}.downsamplers.0.conv.bias"] = checkpoint[
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = checkpoint[
                 f"input_blocks.{i}.0.op.bias"
             ]
+            continue
 
         paths = renew_resnet_paths(resnets)
-        meta_path = {"old": f"input_blocks.{i}.0", "new": f"downsample_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
         resnet_op = {"old": "resnets.2.op", "new": "downsamplers.0.op"}
         assign_to_checkpoint(
             paths, new_checkpoint, checkpoint, additional_replacements=[meta_path, resnet_op], config=config
@@ -193,18 +194,18 @@ def convert_ldm_checkpoint(checkpoint, config):
             paths = renew_attention_paths(attentions)
             meta_path = {
                 "old": f"input_blocks.{i}.1",
-                "new": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}",
+                "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}",
             }
             to_split = {
                 f"input_blocks.{i}.1.qkv.bias": {
-                    "key": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias",
-                    "query": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias",
-                    "value": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias",
+                    "key": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias",
+                    "query": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias",
+                    "value": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias",
                 },
                 f"input_blocks.{i}.1.qkv.weight": {
-                    "key": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight",
-                    "query": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight",
-                    "value": f"downsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight",
+                    "key": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight",
+                    "query": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight",
+                    "value": f"down_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight",
                 },
             }
             assign_to_checkpoint(

From b93fe085459ae0327e9177a6f871172f1cebceab Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 9 Nov 2022 12:28:56 +0100
Subject: [PATCH 164/269] [Loading] Make sure loading edge cases work (#1192)

* [Loading] Make edge cases work

* up

* finish

* up
---
 src/diffusers/pipeline_flax_utils.py | 14 +++++----
 src/diffusers/pipeline_utils.py      | 15 ++++++----
 tests/test_pipelines.py              | 44 ++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
index e63009b49c8f..3963c80cd1cd 100644
--- a/src/diffusers/pipeline_flax_utils.py
+++ b/src/diffusers/pipeline_flax_utils.py
@@ -55,6 +55,8 @@
         "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
         "FlaxPreTrainedModel": ["save_pretrained", "from_pretrained"],
         "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
+        "ProcessorMixin": ["save_pretrained", "from_pretrained"],
+        "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
     },
 }
 
@@ -172,8 +174,8 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], params: Union
             for library_name, library_classes in LOADABLE_CLASSES.items():
                 library = importlib.import_module(library_name)
                 for base_class, save_load_methods in library_classes.items():
-                    class_candidate = getattr(library, base_class)
-                    if issubclass(model_cls, class_candidate):
+                    class_candidate = getattr(library, base_class, None)
+                    if class_candidate is not None and issubclass(model_cls, class_candidate):
                         # if we found a suitable base class in LOADABLE_CLASSES then grab its save method
                         save_method_name = save_load_methods[0]
                         break
@@ -387,11 +389,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     library = importlib.import_module(library_name)
                     class_obj = getattr(library, class_name)
                     importable_classes = LOADABLE_CLASSES[library_name]
-                    class_candidates = {c: getattr(library, c) for c in importable_classes.keys()}
+                    class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
 
                     expected_class_obj = None
                     for class_name, class_candidate in class_candidates.items():
-                        if issubclass(class_obj, class_candidate):
+                        if class_candidate is not None and issubclass(class_obj, class_candidate):
                             expected_class_obj = class_candidate
 
                     if not issubclass(passed_class_obj[name].__class__, expected_class_obj):
@@ -425,12 +427,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 class_obj = import_flax_or_no_model(library, class_name)
 
                 importable_classes = LOADABLE_CLASSES[library_name]
-                class_candidates = {c: getattr(library, c) for c in importable_classes.keys()}
+                class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
 
             if loaded_sub_model is None and sub_model_should_be_defined:
                 load_method_name = None
                 for class_name, class_candidate in class_candidates.items():
-                    if issubclass(class_obj, class_candidate):
+                    if class_candidate is not None and issubclass(class_obj, class_candidate):
                         load_method_name = importable_classes[class_name][1]
 
                 load_method = getattr(class_obj, load_method_name)
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 5fa120fe2d0b..a194f3eb34d9 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -74,6 +74,8 @@
         "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
         "PreTrainedModel": ["save_pretrained", "from_pretrained"],
         "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
+        "ProcessorMixin": ["save_pretrained", "from_pretrained"],
+        "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
     },
 }
 
@@ -190,8 +192,8 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike]):
             for library_name, library_classes in LOADABLE_CLASSES.items():
                 library = importlib.import_module(library_name)
                 for base_class, save_load_methods in library_classes.items():
-                    class_candidate = getattr(library, base_class)
-                    if issubclass(model_cls, class_candidate):
+                    class_candidate = getattr(library, base_class, None)
+                    if class_candidate is not None and issubclass(model_cls, class_candidate):
                         # if we found a suitable base class in LOADABLE_CLASSES then grab its save method
                         save_method_name = save_load_methods[0]
                         break
@@ -543,11 +545,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     library = importlib.import_module(library_name)
                     class_obj = getattr(library, class_name)
                     importable_classes = LOADABLE_CLASSES[library_name]
-                    class_candidates = {c: getattr(library, c) for c in importable_classes.keys()}
+                    class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
 
                     expected_class_obj = None
                     for class_name, class_candidate in class_candidates.items():
-                        if issubclass(class_obj, class_candidate):
+                        if class_candidate is not None and issubclass(class_obj, class_candidate):
                             expected_class_obj = class_candidate
 
                     if not issubclass(passed_class_obj[name].__class__, expected_class_obj):
@@ -577,14 +579,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             else:
                 # else we just import it from the library.
                 library = importlib.import_module(library_name)
+
                 class_obj = getattr(library, class_name)
                 importable_classes = LOADABLE_CLASSES[library_name]
-                class_candidates = {c: getattr(library, c) for c in importable_classes.keys()}
+                class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
 
             if loaded_sub_model is None and sub_model_should_be_defined:
                 load_method_name = None
                 for class_name, class_candidate in class_candidates.items():
-                    if issubclass(class_obj, class_candidate):
+                    if class_candidate is not None and issubclass(class_obj, class_candidate):
                         load_method_name = importable_classes[class_name][1]
 
                 if load_method_name is None:
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 753c821dd315..da5bd3c24417 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -88,6 +88,50 @@ def test_download_only_pytorch(self):
             # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack
             assert not any(f.endswith(".msgpack") for f in files)
 
+    def test_download_no_safety_checker(self):
+        prompt = "hello"
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
+
+        pipe_2 = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
+        generator_2 = torch.Generator(device=torch_device).manual_seed(0)
+        out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator_2, output_type="numpy").images
+
+        assert np.max(np.abs(out - out_2)) < 1e-3
+
+    def test_load_no_safety_checker_explicit_locally(self):
+        prompt = "hello"
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
+        )
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None)
+            generator_2 = torch.Generator(device=torch_device).manual_seed(0)
+            out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator_2, output_type="numpy").images
+
+        assert np.max(np.abs(out - out_2)) < 1e-3
+
+    def test_load_no_safety_checker_default_locally(self):
+        prompt = "hello"
+        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname)
+            generator_2 = torch.Generator(device=torch_device).manual_seed(0)
+            out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator_2, output_type="numpy").images
+
+        assert np.max(np.abs(out - out_2)) < 1e-3
+
 
 class CustomPipelineTests(unittest.TestCase):
     def test_load_custom_pipeline(self):

From 5a59f9b7179def4919dd852cd11b822dae76375b Mon Sep 17 00:00:00 2001
From: "Duong A. Nguyen" <38061659+duongna21@users.noreply.github.com>
Date: Wed, 9 Nov 2022 19:42:16 +0700
Subject: [PATCH 165/269] Add LDM Super Resolution pipeline (#1116)

* Add ldm super resolution pipeline

* style

* fix copies

* style

* fix doc

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* add doc

* address comments

* address comments

* fix doc

* minor

* add tests

* add tests

* load text encoder from subfolder

* fix test

* fix test

* style

* style

* handle mps latents

* unfix typo

* unfix typo

* Update tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* fix set_timesteps mps

* fix set_timesteps mps

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Update src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* style

* test 64x64 instead of 256x256

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 .../source/api/pipelines/latent_diffusion.mdx |   5 +
 src/diffusers/__init__.py                     |   1 +
 src/diffusers/pipelines/__init__.py           |   1 +
 .../pipelines/latent_diffusion/__init__.py    |   1 +
 ...peline_latent_diffusion_superresolution.py | 169 ++++++++++++++++++
 src/diffusers/utils/dummy_pt_objects.py       |  15 ++
 .../test_latent_diffusion_superresolution.py  | 118 ++++++++++++
 7 files changed, 310 insertions(+)
 create mode 100644 src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
 create mode 100644 tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py

diff --git a/docs/source/api/pipelines/latent_diffusion.mdx b/docs/source/api/pipelines/latent_diffusion.mdx
index 6d63cd5cbe5b..4ade13e67b15 100644
--- a/docs/source/api/pipelines/latent_diffusion.mdx
+++ b/docs/source/api/pipelines/latent_diffusion.mdx
@@ -33,6 +33,7 @@ The original codebase can be found [here](https://github.com/CompVis/latent-diff
 | Pipeline | Tasks | Colab
 |---|---|:---:|
 | [pipeline_latent_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py) | *Text-to-Image Generation* | - |
+| [pipeline_latent_diffusion_superresolution.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py) | *Super Resolution* | - |
 
 ## Examples:
 
@@ -40,3 +41,7 @@ The original codebase can be found [here](https://github.com/CompVis/latent-diff
 ## LDMTextToImagePipeline
 [[autodoc]] pipelines.latent_diffusion.pipeline_latent_diffusion.LDMTextToImagePipeline
     - __call__
+
+## LDMSuperResolutionPipeline
+[[autodoc]] pipelines.latent_diffusion.pipeline_latent_diffusion_superresolution.LDMSuperResolutionPipeline
+    - __call__
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index da56dc888138..86eda7371fe9 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -35,6 +35,7 @@
         DDPMPipeline,
         KarrasVePipeline,
         LDMPipeline,
+        LDMSuperResolutionPipeline,
         PNDMPipeline,
         RePaintPipeline,
         ScoreSdeVePipeline,
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index eb0635f6ee07..ef4d23e5e6d0 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -5,6 +5,7 @@
     from .dance_diffusion import DanceDiffusionPipeline
     from .ddim import DDIMPipeline
     from .ddpm import DDPMPipeline
+    from .latent_diffusion import LDMSuperResolutionPipeline
     from .latent_diffusion_uncond import LDMPipeline
     from .pndm import PNDMPipeline
     from .repaint import RePaintPipeline
diff --git a/src/diffusers/pipelines/latent_diffusion/__init__.py b/src/diffusers/pipelines/latent_diffusion/__init__.py
index c481b38cf5e0..5544527ff587 100644
--- a/src/diffusers/pipelines/latent_diffusion/__init__.py
+++ b/src/diffusers/pipelines/latent_diffusion/__init__.py
@@ -1,5 +1,6 @@
 # flake8: noqa
 from ...utils import is_transformers_available
+from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline
 
 
 if is_transformers_available():
diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
new file mode 100644
index 000000000000..044ff359e317
--- /dev/null
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
@@ -0,0 +1,169 @@
+import inspect
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+
+import PIL
+
+from ...models import UNet2DModel, VQModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+class LDMSuperResolutionPipeline(DiffusionPipeline):
+    r"""
+    A pipeline for image super-resolution using Latent
+
+    This class inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        vqvae ([`VQModel`]):
+            Vector-quantized (VQ) VAE Model to encode and decode images to and from latent representations.
+        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`],
+            [`EulerAncestralDiscreteScheduler`], [`DPMSolverMultistepScheduler`], or [`PNDMScheduler`].
+    """
+
+    def __init__(
+        self,
+        vqvae: VQModel,
+        unet: UNet2DModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+    ):
+        super().__init__()
+        self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        init_image: Union[torch.Tensor, PIL.Image.Image],
+        batch_size: Optional[int] = 1,
+        num_inference_steps: Optional[int] = 100,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[Tuple, ImagePipelineOutput]:
+        r"""
+        Args:
+            init_image (`torch.Tensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            batch_size (`int`, *optional*, defaults to 1):
+                Number of images to generate.
+            num_inference_steps (`int`, *optional*, defaults to 100):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        if isinstance(init_image, PIL.Image.Image):
+            batch_size = 1
+        elif isinstance(init_image, torch.Tensor):
+            batch_size = init_image.shape[0]
+        else:
+            raise ValueError(
+                f"`init_image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(init_image)}"
+            )
+
+        if isinstance(init_image, PIL.Image.Image):
+            init_image = preprocess(init_image)
+
+        height, width = init_image.shape[-2:]
+
+        # in_channels should be 6: 3 for latents, 3 for low resolution image
+        latents_shape = (batch_size, self.unet.in_channels // 2, height, width)
+        latents_dtype = next(self.unet.parameters()).dtype
+
+        if self.device.type == "mps":
+            # randn does not work reproducibly on mps
+            latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype)
+            latents = latents.to(self.device)
+        else:
+            latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
+
+        init_image = init_image.to(device=self.device, dtype=latents_dtype)
+
+        # set timesteps and move to the correct device
+        self.scheduler.set_timesteps(num_inference_steps, device=self.device)
+        timesteps_tensor = self.scheduler.timesteps
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature.
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_kwargs = {}
+        if accepts_eta:
+            extra_kwargs["eta"] = eta
+
+        for t in self.progress_bar(timesteps_tensor):
+            # concat latents and low resolution image in the channel dimension.
+            latents_input = torch.cat([latents, init_image], dim=1)
+            latents_input = self.scheduler.scale_model_input(latents_input, t)
+            # predict the noise residual
+            noise_pred = self.unet(latents_input, t).sample
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample
+
+        # decode the image latents with the VQVAE
+        image = self.vqvae.decode(latents).sample
+        image = torch.clamp(image, -1.0, 1.0)
+        image = image / 2 + 0.5
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 9d296d29977d..af2e0c7c61d6 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -227,6 +227,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class LDMSuperResolutionPipeline(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class PNDMPipeline(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
new file mode 100644
index 000000000000..f5ec56d1bdc2
--- /dev/null
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
@@ -0,0 +1,118 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+import numpy as np
+import torch
+
+import PIL
+from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel
+from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from diffusers.utils.testing_utils import require_torch
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class LDMSuperResolutionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_uncond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=6,
+            out_channels=3,
+            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
+            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
+        )
+        return model
+
+    @property
+    def dummy_vq_model(self):
+        torch.manual_seed(0)
+        model = VQModel(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=3,
+        )
+        return model
+
+    def test_inference_superresolution(self):
+        unet = self.dummy_uncond_unet
+        scheduler = DDIMScheduler()
+        vqvae = self.dummy_vq_model
+
+        ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)
+        ldm.to(torch_device)
+        ldm.set_progress_bar_config(disable=None)
+
+        init_image = self.dummy_image.to(torch_device)
+
+        # Warmup pass when using mps (see #372)
+        if torch_device == "mps":
+            generator = torch.manual_seed(0)
+            _ = ldm(init_image, generator=generator, num_inference_steps=1, output_type="numpy").images
+
+        generator = torch.manual_seed(0)
+        image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.8634, 0.8186, 0.6416, 0.6846, 0.4427, 0.5676, 0.4679, 0.6247, 0.5176])
+        tolerance = 1e-2 if torch_device != "mps" else 3e-2
+        assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
+
+
+@slow
+@require_torch
+class LDMSuperResolutionPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_superresolution(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/vq_diffusion/teddy_bear_pool.png"
+        )
+        init_image = init_image.resize((64, 64), resample=PIL.Image.LANCZOS)
+
+        ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution", device_map="auto")
+        ldm.to(torch_device)
+        ldm.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = ldm(init_image, generator=generator, num_inference_steps=20, output_type="numpy").images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 256, 256, 3)
+        expected_slice = np.array([0.7418, 0.7472, 0.7424, 0.7422, 0.7463, 0.726, 0.7382, 0.7248, 0.6828])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

From 0248541deadfa187150fe7f96a575ff905ecddd7 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 9 Nov 2022 15:46:08 +0100
Subject: [PATCH 166/269] [Conversion] Improve conversion script (#1218)

up
---
 ...vert_original_stable_diffusion_to_diffusers.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py
index 46073001f126..375b12b6f88b 100644
--- a/scripts/convert_original_stable_diffusion_to_diffusers.py
+++ b/scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -30,6 +30,9 @@
 from diffusers import (
     AutoencoderKL,
     DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
     LDMTextToImagePipeline,
     LMSDiscreteScheduler,
     PNDMScheduler,
@@ -647,7 +650,7 @@ def convert_ldm_clip_checkpoint(checkpoint):
         "--scheduler_type",
         default="pndm",
         type=str,
-        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim']",
+        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancest', 'dpm']",
     )
     parser.add_argument(
         "--extract_ema",
@@ -686,6 +689,16 @@ def convert_ldm_clip_checkpoint(checkpoint):
         )
     elif args.scheduler_type == "lms":
         scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
+    elif args.scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
+    elif args.scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler(
+            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+        )
+    elif args.scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler(
+            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+        )
     elif args.scheduler_type == "ddim":
         scheduler = DDIMScheduler(
             beta_start=beta_start,

From 6c0335c7f95b85481777445ac96d9817364ba6ae Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 9 Nov 2022 16:02:11 +0100
Subject: [PATCH 167/269] DDIM docs (#1219)

---
 docs/source/api/pipelines/ddim.mdx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/api/pipelines/ddim.mdx b/docs/source/api/pipelines/ddim.mdx
index 2bb9def76ed8..a7a5421b36fe 100644
--- a/docs/source/api/pipelines/ddim.mdx
+++ b/docs/source/api/pipelines/ddim.mdx
@@ -20,7 +20,8 @@ The abstract of the paper is the following:
 
 Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, yet they require simulating a Markov chain for many steps to produce a sample. To accelerate sampling, we present denoising diffusion implicit models (DDIMs), a more efficient class of iterative implicit probabilistic models with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a Markovian diffusion process. We construct a class of non-Markovian diffusion processes that lead to the same training objective, but whose reverse process can be much faster to sample from. We empirically demonstrate that DDIMs can produce high quality samples 10× to 50× faster in terms of wall-clock time compared to DDPMs, allow us to trade off computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space.
 
-The original codebase of this paper can be found [here](https://github.com/ermongroup/ddim).
+The original codebase of this paper can be found here: [ermongroup/ddim](https://github.com/ermongroup/ddim).
+For questions, feel free to contact the author on [tsong.me](https://tsong.me/).
 
 ## Available Pipelines:
 

From 4969f46511ef07e81b9b7429cd0fb88d1926849c Mon Sep 17 00:00:00 2001
From: Jesse Casey <31020859+jncasey@users.noreply.github.com>
Date: Wed, 9 Nov 2022 14:01:31 -0500
Subject: [PATCH 168/269] apply `repeat_interleave` fix for `mps` to stable
 diffusion image2image pipeline (#1135)

copy from other pipeline
---
 .../stable_diffusion/pipeline_stable_diffusion_img2img.py   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index f05819b0d914..6bf2babbfef1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -337,8 +337,10 @@ def __call__(
             text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
         text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
-        # duplicate text embeddings for each generation per prompt
-        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`

From af279434d03e6e3be7808ecd15c652338b31024b Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Wed, 9 Nov 2022 20:04:43 +0100
Subject: [PATCH 169/269] Flax tests: don't hardcode number of devices (#1175)

Flax tests: don't hardcode number of devices.

This makes it possible to test on CPU/GPU. However, expected slices are
only checked when there are 8 devices.
---
 tests/test_pipelines_flax.py | 47 ++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/tests/test_pipelines_flax.py b/tests/test_pipelines_flax.py
index ac5e2621a514..72316aad9211 100644
--- a/tests/test_pipelines_flax.py
+++ b/tests/test_pipelines_flax.py
@@ -73,18 +73,19 @@ def test_dummy_all_tpus(self):
 
         # shard inputs and rng
         params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, 8)
+        prng_seed = jax.random.split(prng_seed, num_samples)
         prompt_ids = shard(prompt_ids)
 
         images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
 
-        assert images.shape == (8, 1, 128, 128, 3)
-        assert np.abs(np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 3.1111548) < 1e-3
-        assert np.abs(np.abs(images, dtype=np.float32).sum() - 199746.95) < 5e-1
+        assert images.shape == (num_samples, 1, 128, 128, 3)
+        if jax.device_count() == 8:
+            assert np.abs(np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 3.1111548) < 1e-3
+            assert np.abs(np.abs(images, dtype=np.float32).sum() - 199746.95) < 5e-1
 
         images_pil = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
 
-        assert len(images_pil) == 8
+        assert len(images_pil) == num_samples
 
     def test_stable_diffusion_v1_4(self):
         pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
@@ -107,14 +108,15 @@ def test_stable_diffusion_v1_4(self):
 
         # shard inputs and rng
         params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, 8)
+        prng_seed = jax.random.split(prng_seed, num_samples)
         prompt_ids = shard(prompt_ids)
 
         images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
 
-        assert images.shape == (8, 1, 512, 512, 3)
-        assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.05652401)) < 1e-3
-        assert np.abs((np.abs(images, dtype=np.float32).sum() - 2383808.2)) < 5e-1
+        assert images.shape == (num_samples, 1, 512, 512, 3)
+        if jax.device_count() == 8:
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.05652401)) < 1e-3
+            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2383808.2)) < 5e-1
 
     def test_stable_diffusion_v1_4_bfloat_16(self):
         pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
@@ -137,14 +139,15 @@ def test_stable_diffusion_v1_4_bfloat_16(self):
 
         # shard inputs and rng
         params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, 8)
+        prng_seed = jax.random.split(prng_seed, num_samples)
         prompt_ids = shard(prompt_ids)
 
         images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
 
-        assert images.shape == (8, 1, 512, 512, 3)
-        assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.06652832)) < 1e-3
-        assert np.abs((np.abs(images, dtype=np.float32).sum() - 2384849.8)) < 5e-1
+        assert images.shape == (num_samples, 1, 512, 512, 3)
+        if jax.device_count() == 8:
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.06652832)) < 1e-3
+            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2384849.8)) < 5e-1
 
     def test_stable_diffusion_v1_4_bfloat_16_with_safety(self):
         pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
@@ -165,14 +168,15 @@ def test_stable_diffusion_v1_4_bfloat_16_with_safety(self):
 
         # shard inputs and rng
         params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, 8)
+        prng_seed = jax.random.split(prng_seed, num_samples)
         prompt_ids = shard(prompt_ids)
 
         images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
 
-        assert images.shape == (8, 1, 512, 512, 3)
-        assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.06652832)) < 1e-3
-        assert np.abs((np.abs(images, dtype=np.float32).sum() - 2384849.8)) < 5e-1
+        assert images.shape == (num_samples, 1, 512, 512, 3)
+        if jax.device_count() == 8:
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.06652832)) < 1e-3
+            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2384849.8)) < 5e-1
 
     def test_stable_diffusion_v1_4_bfloat_16_ddim(self):
         scheduler = FlaxDDIMScheduler(
@@ -210,11 +214,12 @@ def test_stable_diffusion_v1_4_bfloat_16_ddim(self):
 
         # shard inputs and rng
         params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, 8)
+        prng_seed = jax.random.split(prng_seed, num_samples)
         prompt_ids = shard(prompt_ids)
 
         images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
 
-        assert images.shape == (8, 1, 512, 512, 3)
-        assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.045043945)) < 1e-3
-        assert np.abs((np.abs(images, dtype=np.float32).sum() - 2347693.5)) < 5e-1
+        assert images.shape == (num_samples, 1, 512, 512, 3)
+        if jax.device_count() == 8:
+            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.045043945)) < 1e-3
+            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2347693.5)) < 5e-1

From 13f388eeb29fc0fc64674031c042449622f5295d Mon Sep 17 00:00:00 2001
From: exo-pla-net <47930742+exo-pla-net@users.noreply.github.com>
Date: Wed, 9 Nov 2022 12:39:27 -0800
Subject: [PATCH 170/269] Improve documentation for the LPW pipeline (#1182)

---
 examples/community/README.md | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/examples/community/README.md b/examples/community/README.md
index b83b2ff4d45e..fd6fff79c576 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -179,9 +179,20 @@ images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_imag
 As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
 
 ### Long Prompt Weighting Stable Diffusion
-
-The Pipeline lets you input prompt without 77 token length limit. And you can increase words weighting by using "()" or decrease words weighting by using "[]"
-The Pipeline also lets you use the main use cases of the stable diffusion pipeline in a single class.
+Features of this custom pipeline:
+- Input a prompt without the 77 token length limit.
+- Includes tx2img, img2img. and inpainting pipelines.
+- Emphasize/weigh part of your prompt with parentheses as so: `a baby deer with (big eyes)`
+- De-emphasize part of your prompt as so: `a [baby] deer with big eyes`
+- Precisely weigh part of your prompt as so: `a baby deer with (big eyes:1.3)`
+
+Prompt weighting equivalents:
+- `a baby deer with` == `(a baby deer with:1.0)`
+- `(big eyes)` == `(big eyes:1.1)`
+- `((big eyes))` == `(big eyes:1.21)`
+- `[big eyes]` == `(big eyes:0.91)`
+
+You can run this custom pipeline as so:
 
 #### pytorch
 

From 3d98dc763a645423f992b3546c4d9cdd97eb9d04 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 9 Nov 2022 22:18:57 +0100
Subject: [PATCH 171/269] Factor out encode text with Copied from (#1224)

* up

* more fixes

* fix

* finalize

* Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py

* upload models

* up
---
 .../pipeline_cycle_diffusion.py               | 186 ++++++++++--------
 .../pipeline_onnx_stable_diffusion.py         | 108 ++++++----
 .../pipeline_onnx_stable_diffusion_img2img.py | 134 +++++++------
 .../pipeline_onnx_stable_diffusion_inpaint.py | 138 +++++++------
 .../pipeline_stable_diffusion.py              | 153 ++++++++------
 .../pipeline_stable_diffusion_img2img.py      | 146 ++++++++------
 .../pipeline_stable_diffusion_inpaint.py      | 150 ++++++++------
 ...ipeline_stable_diffusion_inpaint_legacy.py | 175 ++++++++++------
 8 files changed, 710 insertions(+), 480 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 631ceda813ee..50f519c77f52 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -205,6 +205,110 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `set_attention_slice`
         self.enable_attention_slicing(None)
 
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
     @torch.no_grad()
     def __call__(
         self,
@@ -309,89 +413,17 @@ def __call__(
         if isinstance(init_image, PIL.Image.Image):
             init_image = preprocess(init_image)
 
-        # get prompt text embeddings
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        )
-        source_text_inputs = self.tokenizer(
-            source_prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids
-        source_text_input_ids = source_text_inputs.input_ids
-
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        if source_text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(source_text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            source_text_input_ids = source_text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
-        source_text_embeddings = self.text_encoder(source_text_input_ids.to(self.device))[0]
-
-        # duplicate text embeddings for each generation per prompt
-        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
-        source_text_embeddings = source_text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+        device = self._execution_device
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
 
-        # get unconditional embeddings for classifier free guidance
-        uncond_tokens = [""]
-
-        max_length = text_input_ids.shape[-1]
-        uncond_input = self.tokenizer(
-            uncond_tokens,
-            padding="max_length",
-            max_length=max_length,
-            truncation=True,
-            return_tensors="pt",
+        text_embeddings = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance, None)
+        source_text_embeddings = self._encode_prompt(
+            source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, None
         )
-        uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
-
-        # duplicate unconditional embeddings for each generation per prompt
-        uncond_embeddings = uncond_embeddings.repeat_interleave(batch_size * num_images_per_prompt, dim=0)
-
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        source_uncond_tokens = [""]
-
-        max_length = source_text_input_ids.shape[-1]
-        source_uncond_input = self.tokenizer(
-            source_uncond_tokens,
-            padding="max_length",
-            max_length=max_length,
-            truncation=True,
-            return_tensors="pt",
-        )
-        source_uncond_embeddings = self.text_encoder(source_uncond_input.input_ids.to(self.device))[0]
-
-        # duplicate unconditional embeddings for each generation per prompt
-        source_uncond_embeddings = source_uncond_embeddings.repeat_interleave(
-            batch_size * num_images_per_prompt, dim=0
-        )
-
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        source_text_embeddings = torch.cat([source_uncond_embeddings, source_text_embeddings])
 
         # encode the init image into latents and scale the latents
         latents_dtype = text_embeddings.dtype
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index 3b4689086f27..d1e2704fceeb 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -92,44 +92,22 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        height: Optional[int] = 512,
-        width: Optional[int] = 512,
-        num_inference_steps: Optional[int] = 50,
-        guidance_scale: Optional[float] = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: Optional[float] = 0.0,
-        generator: Optional[np.random.RandomState] = None,
-        latents: Optional[np.ndarray] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-        callback_steps: Optional[int] = 1,
-        **kwargs,
-    ):
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-        if generator is None:
-            generator = np.random
+    def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
 
         # get prompt text embeddings
         text_inputs = self.tokenizer(
@@ -150,10 +128,6 @@ def __call__(
         text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
         text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
 
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
         # get unconditional embeddings for classifier free guidance
         if do_classifier_free_guidance:
             uncond_tokens: List[str]
@@ -191,6 +165,56 @@ def __call__(
             # to avoid doing two forward passes
             text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
 
+        return text_embeddings
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        latents: Optional[np.ndarray] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if generator is None:
+            generator = np.random
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        text_embeddings = self._encode_prompt(
+            prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
         # get the initial random noise unless the user supplied it
         latents_dtype = text_embeddings.dtype
         latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index f167acf23357..a09dfe751f4c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -138,6 +138,82 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
+    def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="np",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
     def __call__(
         self,
         prompt: Union[str, List[str]],
@@ -236,66 +312,14 @@ def __call__(
         if isinstance(init_image, PIL.Image.Image):
             init_image = preprocess(init_image)
 
-        # get prompt text embeddings
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="np",
-        )
-        text_input_ids = text_inputs.input_ids
-
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
-
-        # duplicate text embeddings for each generation per prompt
-        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""]
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt] * batch_size
-            elif batch_size != len(negative_prompt):
-                raise ValueError("The length of `negative_prompt` should be equal to batch_size.")
-            else:
-                uncond_tokens = negative_prompt
 
-            max_length = text_input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="np",
-            )
-            uncond_input_ids = uncond_input.input_ids
-            uncond_embeddings = self.text_encoder(input_ids=uncond_input_ids.astype(np.int32))[0]
-
-            # duplicate unconditional embeddings for each generation per prompt
-            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+        text_embeddings = self._encode_prompt(
+            prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
 
         latents_dtype = text_embeddings.dtype
         init_image = init_image.astype(latents_dtype)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 3aa1cc829956..6c226dd432f5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -152,6 +152,82 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
+    def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="np",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
     @torch.no_grad()
     def __call__(
         self,
@@ -258,70 +334,14 @@ def __call__(
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
 
-        # get prompt text embeddings
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="np",
-        )
-        text_input_ids = text_inputs.input_ids
-
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
-
-        # duplicate text embeddings for each generation per prompt
-        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""]
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt] * batch_size
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
 
-            max_length = text_input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="np",
-            )
-            uncond_input_ids = uncond_input.input_ids
-            uncond_embeddings = self.text_encoder(input_ids=uncond_input_ids.astype(np.int32))[0]
-
-            # duplicate unconditional embeddings for each generation per prompt
-            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+        text_embeddings = self._encode_prompt(
+            prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
 
         num_channels_latents = NUM_LATENT_CHANNELS
         latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index c64db5266642..ed5246fa91b6 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -213,6 +213,90 @@ def _execution_device(self):
                 return torch.device(module._hf_hook.execution_device)
         return self.device
 
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
     @torch.no_grad()
     def __call__(
         self,
@@ -306,79 +390,20 @@ def __call__(
 
         device = self._execution_device
 
-        # get prompt text embeddings
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids
-
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
-
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        bs_embed, seq_len, _ = text_embeddings.shape
-        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
-        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            max_length = text_input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
 
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
-            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-        # get the initial random noise unless the user supplied it
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
 
         # Unlike in other pipelines, latents need to be generated in the target device
         # for 1-to-1 results reproducibility with the CompVis implementation.
         # However this currently doesn't work in `mps`.
+
+        # get the initial random noise unless the user supplied it
         latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 6bf2babbfef1..b4ecf600b10e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -220,6 +220,91 @@ def disable_xformers_memory_efficient_attention(self):
         """
         self.unet.set_use_memory_efficient_attention_xformers(False)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
     @torch.no_grad()
     def __call__(
         self,
@@ -319,69 +404,14 @@ def __call__(
         if isinstance(init_image, PIL.Image.Image):
             init_image = preprocess(init_image)
 
-        # get prompt text embeddings
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids
-
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
-
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        bs_embed, seq_len, _ = text_embeddings.shape
-        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
-        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError("The length of `negative_prompt` should be equal to batch_size.")
-            else:
-                uncond_tokens = negative_prompt
-
-            max_length = text_input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
 
-            # duplicate unconditional embeddings for each generation per prompt
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
-            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
 
         # encode the init image into latents and scale the latents
         latents_dtype = text_embeddings.dtype
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 5ccdd07bb384..f42325261a01 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -220,6 +220,91 @@ def disable_xformers_memory_efficient_attention(self):
         """
         self.unet.set_use_memory_efficient_attention_xformers(False)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
     @torch.no_grad()
     def __call__(
         self,
@@ -324,73 +409,14 @@ def __call__(
 
         device = self._execution_device
 
-        # get prompt text embeddings
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids
-
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
-
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        bs_embed, seq_len, _ = text_embeddings.shape
-        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
-        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
 
-            max_length = text_input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
-
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
-            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
 
         # get the initial random noise unless the user supplied it
         # Unlike in other pipelines, latents need to be generated in the target device
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 80d087952640..89d40f7a79ae 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -170,6 +170,110 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `set_attention_slice`
         self.enable_attention_slicing(None)
 
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
     @torch.no_grad()
     def __call__(
         self,
@@ -266,78 +370,23 @@ def __call__(
                 f" {type(callback_steps)}."
             )
 
+        device = self._execution_device
+
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
 
-        # get prompt text embeddings
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids
-
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
-
-        # duplicate text embeddings for each generation per prompt
-        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+        # preprocess image
+        if not isinstance(init_image, torch.FloatTensor):
+            init_image = preprocess_image(init_image)
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            max_length = text_input_ids.shape[-1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
-
-            # duplicate unconditional embeddings for each generation per prompt
-            seq_len = uncond_embeddings.shape[1]
-            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
-            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the unconditional and text embeddings into a single batch
-            # to avoid doing two forward passes
-            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
 
-        # preprocess image
-        if not isinstance(init_image, torch.FloatTensor):
-            init_image = preprocess_image(init_image)
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
 
         # encode the init image into latents and scale the latents
         latents_dtype = text_embeddings.dtype

From 7d0c2729399c3ce019a30fc175b973e892fd5fc3 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Wed, 9 Nov 2022 23:00:23 +0100
Subject: [PATCH 172/269] Match the generator device to the pipeline for DDPM
 and DDIM (#1222)

* Match the generator device to the pipeline for DDPM and DDIM

* style

* fix

* update values

* fix fast tests

* trigger slow tests

* deprecate

* last value fixes

* mps fixes
---
 .../train_unconditional.py                    | 11 ++-
 src/diffusers/pipelines/ddim/pipeline_ddim.py | 29 ++++---
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 15 +++-
 src/diffusers/schedulers/scheduling_ddpm.py   | 14 +++-
 tests/pipelines/ddim/test_ddim.py             | 28 +++----
 tests/pipelines/ddpm/test_ddpm.py             | 28 +++----
 .../test_latent_diffusion_superresolution.py  | 17 ++--
 tests/test_pipelines.py                       | 83 ++++++++-----------
 8 files changed, 115 insertions(+), 110 deletions(-)

diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py
index 0eadecbd3095..54a94d98b578 100644
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -11,10 +11,12 @@
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from datasets import load_dataset
-from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
+from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel, __version__
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
+from diffusers.utils import deprecate
 from huggingface_hub import HfFolder, Repository, whoami
+from packaging import version
 from torchvision.transforms import (
     CenterCrop,
     Compose,
@@ -28,6 +30,7 @@
 
 
 logger = get_logger(__name__)
+diffusers_version = version.parse(version.parse(__version__).base_version)
 
 
 def _extract_into_tensor(arr, timesteps, broadcast_shape):
@@ -406,7 +409,11 @@ def transforms(examples):
                     scheduler=noise_scheduler,
                 )
 
-                generator = torch.manual_seed(0)
+                deprecate("todo: remove this check", "0.10.0", "when the most used version is >= 0.8.0")
+                if diffusers_version < version.parse("0.8.0"):
+                    generator = torch.manual_seed(0)
+                else:
+                    generator = torch.Generator(device=pipeline.device).manual_seed(0)
                 # run pipeline in inference (sample random noise and denoise)
                 images = pipeline(
                     generator=generator,
diff --git a/src/diffusers/pipelines/ddim/pipeline_ddim.py b/src/diffusers/pipelines/ddim/pipeline_ddim.py
index aab6e68613c4..d0bca8038ec4 100644
--- a/src/diffusers/pipelines/ddim/pipeline_ddim.py
+++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import inspect
 from typing import Optional, Tuple, Union
 
 import torch
 
 from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...utils import deprecate
 
 
 class DDIMPipeline(DiffusionPipeline):
@@ -75,24 +75,29 @@ def __call__(
             generated images.
         """
 
+        if generator is not None and generator.device.type != self.device.type and self.device.type != "mps":
+            message = (
+                f"The `generator` device is `{generator.device}` and does not match the pipeline "
+                f"device `{self.device}`, so the `generator` will be set to `None`. "
+                f'Please use `generator=torch.Generator(device="{self.device}")` instead.'
+            )
+            deprecate(
+                "generator.device == 'cpu'",
+                "0.11.0",
+                message,
+            )
+            generator = None
+
         # Sample gaussian noise to begin loop
         image = torch.randn(
             (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
             generator=generator,
+            device=self.device,
         )
-        image = image.to(self.device)
 
         # set step values
         self.scheduler.set_timesteps(num_inference_steps)
 
-        # Ignore use_clipped_model_output if the scheduler doesn't accept this argument
-        accepts_use_clipped_model_output = "use_clipped_model_output" in set(
-            inspect.signature(self.scheduler.step).parameters.keys()
-        )
-        extra_kwargs = {}
-        if accepts_use_clipped_model_output:
-            extra_kwargs["use_clipped_model_output"] = use_clipped_model_output
-
         for t in self.progress_bar(self.scheduler.timesteps):
             # 1. predict noise model_output
             model_output = self.unet(image, t).sample
@@ -100,7 +105,9 @@ def __call__(
             # 2. predict previous mean of image x_t-1 and add variance depending on eta
             # eta corresponds to η in paper and should be between [0, 1]
             # do x_t -> x_t-1
-            image = self.scheduler.step(model_output, t, image, eta, **extra_kwargs).prev_sample
+            image = self.scheduler.step(
+                model_output, t, image, eta=eta, use_clipped_model_output=use_clipped_model_output, generator=generator
+            ).prev_sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index 3665c68efe1d..d145c5d518a1 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -80,12 +80,25 @@ def __call__(
             new_config["predict_epsilon"] = predict_epsilon
             self.scheduler._internal_dict = FrozenDict(new_config)
 
+        if generator is not None and generator.device.type != self.device.type and self.device.type != "mps":
+            message = (
+                f"The `generator` device is `{generator.device}` and does not match the pipeline "
+                f"device `{self.device}`, so the `generator` will be set to `None`. "
+                f'Please use `torch.Generator(device="{self.device}")` instead.'
+            )
+            deprecate(
+                "generator.device == 'cpu'",
+                "0.11.0",
+                message,
+            )
+            generator = None
+
         # Sample gaussian noise to begin loop
         image = torch.randn(
             (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
             generator=generator,
+            device=self.device,
         )
-        image = image.to(self.device)
 
         # set step values
         self.scheduler.set_timesteps(num_inference_steps)
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 08a73119e5b7..a19d91879cc3 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -292,10 +292,16 @@ def step(
         # 6. Add noise
         variance = 0
         if t > 0:
-            noise = torch.randn(
-                model_output.size(), dtype=model_output.dtype, layout=model_output.layout, generator=generator
-            ).to(model_output.device)
-            variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * noise
+            device = model_output.device
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                variance_noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator)
+                variance_noise = variance_noise.to(device)
+            else:
+                variance_noise = torch.randn(
+                    model_output.shape, generator=generator, device=device, dtype=model_output.dtype
+                )
+            variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise
 
         pred_prev_sample = pred_prev_sample + variance
 
diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py
index 4445fe7feecf..81c49912be5c 100644
--- a/tests/pipelines/ddim/test_ddim.py
+++ b/tests/pipelines/ddim/test_ddim.py
@@ -19,7 +19,7 @@
 import torch
 
 from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel
-from diffusers.utils.testing_utils import require_torch, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device
 
 from ...test_pipelines_common import PipelineTesterMixin
 
@@ -43,21 +43,18 @@ def dummy_uncond_unet(self):
         return model
 
     def test_inference(self):
+        device = "cpu"
         unet = self.dummy_uncond_unet
         scheduler = DDIMScheduler()
 
         ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
-        ddpm.to(torch_device)
+        ddpm.to(device)
         ddpm.set_progress_bar_config(disable=None)
 
-        # Warmup pass when using mps (see #372)
-        if torch_device == "mps":
-            _ = ddpm(num_inference_steps=1)
-
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=device).manual_seed(0)
         image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=device).manual_seed(0)
         image_from_tuple = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0]
 
         image_slice = image[0, -3:, -3:, -1]
@@ -67,13 +64,12 @@ def test_inference(self):
         expected_slice = np.array(
             [1.000e00, 5.717e-01, 4.717e-01, 1.000e00, 0.000e00, 1.000e00, 3.000e-04, 0.000e00, 9.000e-04]
         )
-        tolerance = 1e-2 if torch_device != "mps" else 3e-2
-        assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
 
 @slow
-@require_torch
+@require_torch_gpu
 class DDIMPipelineIntegrationTests(unittest.TestCase):
     def test_inference_ema_bedroom(self):
         model_id = "google/ddpm-ema-bedroom-256"
@@ -85,13 +81,13 @@ def test_inference_ema_bedroom(self):
         ddpm.to(torch_device)
         ddpm.set_progress_bar_config(disable=None)
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         image = ddpm(generator=generator, output_type="numpy").images
 
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 256, 256, 3)
-        expected_slice = np.array([0.00605, 0.0201, 0.0344, 0.00235, 0.00185, 0.00025, 0.00215, 0.0, 0.00685])
+        expected_slice = np.array([0.1546, 0.1561, 0.1595, 0.1564, 0.1569, 0.1585, 0.1554, 0.1550, 0.1575])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_inference_cifar10(self):
@@ -104,11 +100,11 @@ def test_inference_cifar10(self):
         ddim.to(torch_device)
         ddim.set_progress_bar_config(disable=None)
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         image = ddim(generator=generator, eta=0.0, output_type="numpy").images
 
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.17235, 0.16175, 0.16005, 0.16255, 0.1497, 0.1513, 0.15045, 0.1442, 0.1453])
+        expected_slice = np.array([0.2060, 0.2042, 0.2022, 0.2193, 0.2146, 0.2110, 0.2471, 0.2446, 0.2388])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
index a09f77d12467..e16e0d6e8cbd 100644
--- a/tests/pipelines/ddpm/test_ddpm.py
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -20,7 +20,7 @@
 
 from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
 from diffusers.utils import deprecate
-from diffusers.utils.testing_utils import require_torch, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device
 
 from ...test_pipelines_common import PipelineTesterMixin
 
@@ -44,21 +44,18 @@ def dummy_uncond_unet(self):
         return model
 
     def test_inference(self):
+        device = "cpu"
         unet = self.dummy_uncond_unet
         scheduler = DDPMScheduler()
 
         ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
-        ddpm.to(torch_device)
+        ddpm.to(device)
         ddpm.set_progress_bar_config(disable=None)
 
-        # Warmup pass when using mps (see #372)
-        if torch_device == "mps":
-            _ = ddpm(num_inference_steps=1)
-
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=device).manual_seed(0)
         image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=device).manual_seed(0)
         image_from_tuple = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0]
 
         image_slice = image[0, -3:, -3:, -1]
@@ -68,9 +65,8 @@ def test_inference(self):
         expected_slice = np.array(
             [5.589e-01, 7.089e-01, 2.632e-01, 6.841e-01, 1.000e-04, 9.999e-01, 1.973e-01, 1.000e-04, 8.010e-02]
         )
-        tolerance = 1e-2 if torch_device != "mps" else 3e-2
-        assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
-        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_inference_predict_epsilon(self):
         deprecate("remove this test", "0.10.0", "remove")
@@ -85,10 +81,10 @@ def test_inference_predict_epsilon(self):
         if torch_device == "mps":
             _ = ddpm(num_inference_steps=1)
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", predict_epsilon=False)[0]
 
         image_slice = image[0, -3:, -3:, -1]
@@ -100,7 +96,7 @@ def test_inference_predict_epsilon(self):
 
 
 @slow
-@require_torch
+@require_torch_gpu
 class DDPMPipelineIntegrationTests(unittest.TestCase):
     def test_inference_cifar10(self):
         model_id = "google/ddpm-cifar10-32"
@@ -112,11 +108,11 @@ def test_inference_cifar10(self):
         ddpm.to(torch_device)
         ddpm.set_progress_bar_config(disable=None)
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         image = ddpm(generator=generator, output_type="numpy").images
 
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.41995, 0.35885, 0.19385, 0.38475, 0.3382, 0.2647, 0.41545, 0.3582, 0.33845])
+        expected_slice = np.array([0.4454, 0.2025, 0.0315, 0.3023, 0.2575, 0.1031, 0.0953, 0.1604, 0.2020])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
index f5ec56d1bdc2..f402d2f2a70e 100644
--- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
@@ -68,30 +68,25 @@ def dummy_vq_model(self):
         return model
 
     def test_inference_superresolution(self):
+        device = "cpu"
         unet = self.dummy_uncond_unet
         scheduler = DDIMScheduler()
         vqvae = self.dummy_vq_model
 
         ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)
-        ldm.to(torch_device)
+        ldm.to(device)
         ldm.set_progress_bar_config(disable=None)
 
-        init_image = self.dummy_image.to(torch_device)
-
-        # Warmup pass when using mps (see #372)
-        if torch_device == "mps":
-            generator = torch.manual_seed(0)
-            _ = ldm(init_image, generator=generator, num_inference_steps=1, output_type="numpy").images
+        init_image = self.dummy_image.to(device)
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=device).manual_seed(0)
         image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images
 
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.8634, 0.8186, 0.6416, 0.6846, 0.4427, 0.5676, 0.4679, 0.6247, 0.5176])
-        tolerance = 1e-2 if torch_device != "mps" else 3e-2
-        assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance
+        expected_slice = np.array([0.8678, 0.8245, 0.6381, 0.6830, 0.4385, 0.5599, 0.4641, 0.6201, 0.5150])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
 
 @slow
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index da5bd3c24417..775ab689bdcb 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -42,7 +42,6 @@
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, slow, torch_device
 from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir, require_torch_gpu
-from parameterized import parameterized
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
@@ -93,11 +92,17 @@ def test_download_no_safety_checker(self):
         pipe = StableDiffusionPipeline.from_pretrained(
             "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
         )
-        generator = torch.Generator(device=torch_device).manual_seed(0)
+        pipe = pipe.to(torch_device)
+        if torch_device == "mps":
+            # device type MPS is not supported for torch.Generator() api.
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
         out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
 
         pipe_2 = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
-        generator_2 = torch.Generator(device=torch_device).manual_seed(0)
+        pipe_2 = pipe_2.to(torch_device)
+        generator_2 = generator.manual_seed(0)
         out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator_2, output_type="numpy").images
 
         assert np.max(np.abs(out - out_2)) < 1e-3
@@ -107,13 +112,19 @@ def test_load_no_safety_checker_explicit_locally(self):
         pipe = StableDiffusionPipeline.from_pretrained(
             "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
         )
-        generator = torch.Generator(device=torch_device).manual_seed(0)
+        pipe = pipe.to(torch_device)
+        if torch_device == "mps":
+            # device type MPS is not supported for torch.Generator() api.
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
         out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             pipe.save_pretrained(tmpdirname)
             pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None)
-            generator_2 = torch.Generator(device=torch_device).manual_seed(0)
+            pipe_2 = pipe_2.to(torch_device)
+            generator_2 = generator.manual_seed(0)
             out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator_2, output_type="numpy").images
 
         assert np.max(np.abs(out - out_2)) < 1e-3
@@ -121,13 +132,19 @@ def test_load_no_safety_checker_explicit_locally(self):
     def test_load_no_safety_checker_default_locally(self):
         prompt = "hello"
         pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
-        generator = torch.Generator(device=torch_device).manual_seed(0)
+        pipe = pipe.to(torch_device)
+        if torch_device == "mps":
+            # device type MPS is not supported for torch.Generator() api.
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
         out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             pipe.save_pretrained(tmpdirname)
             pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname)
-            generator_2 = torch.Generator(device=torch_device).manual_seed(0)
+            pipe_2 = pipe_2.to(torch_device)
+            generator_2 = generator.manual_seed(0)
             out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator_2, output_type="numpy").images
 
         assert np.max(np.abs(out - out_2)) < 1e-3
@@ -431,7 +448,7 @@ def test_from_pretrained_save_pretrained(self):
             new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)
             new_ddpm.to(torch_device)
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         image = ddpm(generator=generator, output_type="numpy").images
 
         generator = generator.manual_seed(0)
@@ -452,7 +469,7 @@ def test_from_pretrained_hub(self):
         ddpm_from_hub = ddpm_from_hub.to(torch_device)
         ddpm_from_hub.set_progress_bar_config(disable=None)
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         image = ddpm(generator=generator, output_type="numpy").images
 
         generator = generator.manual_seed(0)
@@ -475,7 +492,7 @@ def test_from_pretrained_hub_pass_model(self):
         ddpm_from_hub = ddpm_from_hub.to(torch_device)
         ddpm_from_hub_custom_model.set_progress_bar_config(disable=None)
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         image = ddpm_from_hub_custom_model(generator=generator, output_type="numpy").images
 
         generator = generator.manual_seed(0)
@@ -491,7 +508,7 @@ def test_output_format(self):
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
         images = pipe(generator=generator, output_type="numpy").images
         assert images.shape == (1, 32, 32, 3)
         assert isinstance(images, np.ndarray)
@@ -506,40 +523,8 @@ def test_output_format(self):
         assert isinstance(images, list)
         assert isinstance(images[0], PIL.Image.Image)
 
-    # Make sure the test passes for different values of random seed
-    @parameterized.expand([(0,), (4,)])
-    def test_ddpm_ddim_equality(self, seed):
-        model_id = "google/ddpm-cifar10-32"
-
-        unet = UNet2DModel.from_pretrained(model_id)
-        ddpm_scheduler = DDPMScheduler()
-        ddim_scheduler = DDIMScheduler()
-
-        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
-        ddpm.to(torch_device)
-        ddpm.set_progress_bar_config(disable=None)
-        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
-        ddim.to(torch_device)
-        ddim.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(seed)
-        ddpm_image = ddpm(generator=generator, output_type="numpy").images
-
-        generator = torch.manual_seed(seed)
-        ddim_image = ddim(
-            generator=generator,
-            num_inference_steps=1000,
-            eta=1.0,
-            output_type="numpy",
-            use_clipped_model_output=True,  # Need this to make DDIM match DDPM
-        ).images
-
-        # the values aren't exactly equal, but the images look the same visually
-        assert np.abs(ddpm_image - ddim_image).max() < 1e-1
-
-    # Make sure the test passes for different values of random seed
-    @parameterized.expand([(0,), (4,)])
-    def test_ddpm_ddim_equality_batched(self, seed):
+    def test_ddpm_ddim_equality_batched(self):
+        seed = 0
         model_id = "google/ddpm-cifar10-32"
 
         unet = UNet2DModel.from_pretrained(model_id)
@@ -554,12 +539,12 @@ def test_ddpm_ddim_equality_batched(self, seed):
         ddim.to(torch_device)
         ddim.set_progress_bar_config(disable=None)
 
-        generator = torch.manual_seed(seed)
-        ddpm_images = ddpm(batch_size=4, generator=generator, output_type="numpy").images
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        ddpm_images = ddpm(batch_size=2, generator=generator, output_type="numpy").images
 
-        generator = torch.manual_seed(seed)
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
         ddim_images = ddim(
-            batch_size=4,
+            batch_size=2,
             generator=generator,
             num_inference_steps=1000,
             eta=1.0,

From 187de44352ce23acf00a9204a05a8a308aab7003 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 9 Nov 2022 22:18:14 +0000
Subject: [PATCH 173/269] Fix device on save/load tests

---
 tests/test_pipelines.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 775ab689bdcb..4559d713ed81 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -102,8 +102,12 @@ def test_download_no_safety_checker(self):
 
         pipe_2 = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
         pipe_2 = pipe_2.to(torch_device)
-        generator_2 = generator.manual_seed(0)
-        out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator_2, output_type="numpy").images
+        if torch_device == "mps":
+            # device type MPS is not supported for torch.Generator() api.
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
+        out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
 
         assert np.max(np.abs(out - out_2)) < 1e-3
 
@@ -124,8 +128,14 @@ def test_load_no_safety_checker_explicit_locally(self):
             pipe.save_pretrained(tmpdirname)
             pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None)
             pipe_2 = pipe_2.to(torch_device)
-            generator_2 = generator.manual_seed(0)
-            out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator_2, output_type="numpy").images
+
+            if torch_device == "mps":
+                # device type MPS is not supported for torch.Generator() api.
+                generator = torch.manual_seed(0)
+            else:
+                generator = torch.Generator(device=torch_device).manual_seed(0)
+
+            out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
 
         assert np.max(np.abs(out - out_2)) < 1e-3
 
@@ -144,8 +154,14 @@ def test_load_no_safety_checker_default_locally(self):
             pipe.save_pretrained(tmpdirname)
             pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname)
             pipe_2 = pipe_2.to(torch_device)
-            generator_2 = generator.manual_seed(0)
-            out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator_2, output_type="numpy").images
+
+            if torch_device == "mps":
+                # device type MPS is not supported for torch.Generator() api.
+                generator = torch.manual_seed(0)
+            else:
+                generator = torch.Generator(device=torch_device).manual_seed(0)
+
+            out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images
 
         assert np.max(np.abs(out - out_2)) < 1e-3
 

From 0feb21a18c44cfbf76a916afead986f04b339292 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 10 Nov 2022 00:09:22 +0100
Subject: [PATCH 174/269] [Tests] Fix mps+generator fast tests (#1230)

* [Tests] Fix mps+generator fast tests

* mps for Euler

* retry

* warmup issue again?

* fix reproducible initial noise

* Revert "fix reproducible initial noise"

This reverts commit f300d05cb9782ed320064a0c58577a32d4139e6d.

* fix reproducible initial noise

* fix device
---
 .github/workflows/pr_tests.yml                |  2 +-
 src/diffusers/pipelines/ddim/pipeline_ddim.py | 14 +++++-----
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 14 +++++-----
 tests/pipelines/ddpm/test_ddpm.py             |  8 ++++--
 tests/test_scheduler.py                       | 26 +++++++++++++++----
 5 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index c978efe3b7db..dc1c482aa098 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -136,7 +136,7 @@ jobs:
     - name: Run fast PyTorch tests on M1 (MPS)
       shell: arch -arch arm64 bash {0}
       run: |
-        ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps tests/
+        ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
 
     - name: Failure short reports
       if: ${{ failure() }}
diff --git a/src/diffusers/pipelines/ddim/pipeline_ddim.py b/src/diffusers/pipelines/ddim/pipeline_ddim.py
index d0bca8038ec4..6db6298329a7 100644
--- a/src/diffusers/pipelines/ddim/pipeline_ddim.py
+++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py
@@ -78,7 +78,7 @@ def __call__(
         if generator is not None and generator.device.type != self.device.type and self.device.type != "mps":
             message = (
                 f"The `generator` device is `{generator.device}` and does not match the pipeline "
-                f"device `{self.device}`, so the `generator` will be set to `None`. "
+                f"device `{self.device}`, so the `generator` will be ignored. "
                 f'Please use `generator=torch.Generator(device="{self.device}")` instead.'
             )
             deprecate(
@@ -89,11 +89,13 @@ def __call__(
             generator = None
 
         # Sample gaussian noise to begin loop
-        image = torch.randn(
-            (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
-            generator=generator,
-            device=self.device,
-        )
+        image_shape = (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size)
+        if self.device.type == "mps":
+            # randn does not work reproducibly on mps
+            image = torch.randn(image_shape, generator=generator)
+            image = image.to(self.device)
+        else:
+            image = torch.randn(image_shape, generator=generator, device=self.device)
 
         # set step values
         self.scheduler.set_timesteps(num_inference_steps)
diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index d145c5d518a1..b7194664f4c4 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -83,7 +83,7 @@ def __call__(
         if generator is not None and generator.device.type != self.device.type and self.device.type != "mps":
             message = (
                 f"The `generator` device is `{generator.device}` and does not match the pipeline "
-                f"device `{self.device}`, so the `generator` will be set to `None`. "
+                f"device `{self.device}`, so the `generator` will be ignored. "
                 f'Please use `torch.Generator(device="{self.device}")` instead.'
             )
             deprecate(
@@ -94,11 +94,13 @@ def __call__(
             generator = None
 
         # Sample gaussian noise to begin loop
-        image = torch.randn(
-            (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
-            generator=generator,
-            device=self.device,
-        )
+        image_shape = (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size)
+        if self.device.type == "mps":
+            # randn does not work reproducibly on mps
+            image = torch.randn(image_shape, generator=generator)
+            image = image.to(self.device)
+        else:
+            image = torch.randn(image_shape, generator=generator, device=self.device)
 
         # set step values
         self.scheduler.set_timesteps(num_inference_steps)
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
index e16e0d6e8cbd..14bc09469773 100644
--- a/tests/pipelines/ddpm/test_ddpm.py
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -81,10 +81,14 @@ def test_inference_predict_epsilon(self):
         if torch_device == "mps":
             _ = ddpm(num_inference_steps=1)
 
-        generator = torch.Generator(device=torch_device).manual_seed(0)
+        if torch_device == "mps":
+            # device type MPS is not supported for torch.Generator() api.
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
         image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
 
-        generator = torch.Generator(device=torch_device).manual_seed(0)
+        generator = generator.manual_seed(0)
         image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", predict_epsilon=False)[0]
 
         image_slice = image[0, -3:, -3:, -1]
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index ab5217151125..a9770f0a54a8 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -1281,7 +1281,11 @@ def test_full_loop_no_noise(self):
 
         scheduler.set_timesteps(self.num_inference_steps)
 
-        generator = torch.Generator(torch_device).manual_seed(0)
+        if torch_device == "mps":
+            # device type MPS is not supported for torch.Generator() api.
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
 
         model = self.dummy_model()
         sample = self.dummy_sample_deter * scheduler.init_noise_sigma
@@ -1308,7 +1312,11 @@ def test_full_loop_device(self):
 
         scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
 
-        generator = torch.Generator(torch_device).manual_seed(0)
+        if torch_device == "mps":
+            # device type MPS is not supported for torch.Generator() api.
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
 
         model = self.dummy_model()
         sample = self.dummy_sample_deter * scheduler.init_noise_sigma
@@ -1364,7 +1372,11 @@ def test_full_loop_no_noise(self):
 
         scheduler.set_timesteps(self.num_inference_steps)
 
-        generator = torch.Generator(device=torch_device).manual_seed(0)
+        if torch_device == "mps":
+            # device type MPS is not supported for torch.Generator() api.
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
 
         model = self.dummy_model()
         sample = self.dummy_sample_deter * scheduler.init_noise_sigma
@@ -1381,7 +1393,7 @@ def test_full_loop_no_noise(self):
         result_sum = torch.sum(torch.abs(sample))
         result_mean = torch.mean(torch.abs(sample))
 
-        if str(torch_device).startswith("cpu"):
+        if torch_device in ["cpu", "mps"]:
             assert abs(result_sum.item() - 152.3192) < 1e-2
             assert abs(result_mean.item() - 0.1983) < 1e-3
         else:
@@ -1396,7 +1408,11 @@ def test_full_loop_device(self):
 
         scheduler.set_timesteps(self.num_inference_steps, device=torch_device)
 
-        generator = torch.Generator(device=torch_device).manual_seed(0)
+        if torch_device == "mps":
+            # device type MPS is not supported for torch.Generator() api.
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
 
         model = self.dummy_model()
         sample = self.dummy_sample_deter * scheduler.init_noise_sigma

From 2e980ac9a0c03200cbe52e78e23f648d17e97b9c Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 10 Nov 2022 00:44:42 +0100
Subject: [PATCH 175/269] [Tests] Adjust TPU test values (#1233)

* [Tests] Adjust TPU test values

* slow tests

* remaining refs
---
 tests/test_scheduler_flax.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_scheduler_flax.py b/tests/test_scheduler_flax.py
index d29a8bfcc210..7928939f2db4 100644
--- a/tests/test_scheduler_flax.py
+++ b/tests/test_scheduler_flax.py
@@ -859,7 +859,7 @@ def test_full_loop_no_noise(self):
         result_mean = jnp.mean(jnp.abs(sample))
 
         if jax_device == "tpu":
-            assert abs(result_sum - 198.1542) < 1e-2
+            assert abs(result_sum - 198.1275) < 1e-2
             assert abs(result_mean - 0.2580) < 1e-3
         else:
             assert abs(result_sum - 198.1318) < 1e-2
@@ -872,8 +872,8 @@ def test_full_loop_with_set_alpha_to_one(self):
         result_mean = jnp.mean(jnp.abs(sample))
 
         if jax_device == "tpu":
-            assert abs(result_sum - 185.4352) < 1e-2
-            assert abs(result_mean - 0.24145) < 1e-3
+            assert abs(result_sum - 186.83226) < 1e-2
+            assert abs(result_mean - 0.24327) < 1e-3
         else:
             assert abs(result_sum - 186.9466) < 1e-2
             assert abs(result_mean - 0.24342) < 1e-3
@@ -885,8 +885,8 @@ def test_full_loop_with_no_set_alpha_to_one(self):
         result_mean = jnp.mean(jnp.abs(sample))
 
         if jax_device == "tpu":
-            assert abs(result_sum - 185.4352) < 1e-2
-            assert abs(result_mean - 0.2414) < 1e-3
+            assert abs(result_sum - 186.83226) < 1e-2
+            assert abs(result_mean - 0.24327) < 1e-3
         else:
             assert abs(result_sum - 186.9482) < 1e-2
             assert abs(result_mean - 0.2434) < 1e-3

From a09d47532d791bec28926e21b3b73124175be595 Mon Sep 17 00:00:00 2001
From: apolinario <joaopaulo.passos@gmail.com>
Date: Thu, 10 Nov 2022 14:37:42 +0100
Subject: [PATCH 176/269] Add a reference to the name 'Sampler' (#1172)

* Add a reference to the name 'Sampler'

- Facilitate people that are familiar with the name samplers to understand that we call that schedulers
- Better SEO if people are googling for samplers to find our library as well

* Update README.md with a reference to 'Sampler'
---
 README.md                      | 2 +-
 docs/source/api/schedulers.mdx | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5c7b911c2ede..64cbd15aab26 100644
--- a/README.md
+++ b/README.md
@@ -428,7 +428,7 @@ If you just want to play around with some web demos, you can try out the followi
 <p>
     
 **Schedulers**: Algorithm class for both **inference** and **training**.
-The class provides functionality to compute previous image according to alpha, beta schedule as well as predict noise for training.
+The class provides functionality to compute previous image according to alpha, beta schedule as well as predict noise for training. Also known as **Samplers**.
 *Examples*: [DDPM](https://arxiv.org/abs/2006.11239), [DDIM](https://arxiv.org/abs/2010.02502), [PNDM](https://arxiv.org/abs/2202.09778), [DEIS](https://arxiv.org/abs/2204.13902)
 
 <p align="center">
diff --git a/docs/source/api/schedulers.mdx b/docs/source/api/schedulers.mdx
index 12575a5ecae2..7ed527bedf3f 100644
--- a/docs/source/api/schedulers.mdx
+++ b/docs/source/api/schedulers.mdx
@@ -16,7 +16,7 @@ Diffusers contains multiple pre-built schedule functions for the diffusion proce
 
 ## What is a scheduler?
 
-The schedule functions, denoted *Schedulers* in the library take in the output of a trained model, a sample which the diffusion process is iterating on, and a timestep to return a denoised sample.
+The schedule functions, denoted *Schedulers* in the library take in the output of a trained model, a sample which the diffusion process is iterating on, and a timestep to return a denoised sample. That's why schedulers may also be called *Samplers* in other diffusion models implementations.
 
 - Schedulers define the methodology for iteratively adding noise to an image or for updating a sample based on model outputs.
     - adding noise in different manners represent the algorithmic processes to train a diffusion model by adding noise to images.

From 045157a46fb16a21a5f37a5f3f3ad710895b680b Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Thu, 10 Nov 2022 16:00:17 +0100
Subject: [PATCH 177/269] Fix Flax usage comments (#1211)

* Fix Flax usage comments (they didn't work).

* Spell out dtype

* make style
---
 src/diffusers/pipeline_flax_utils.py | 33 ++++++++++++++++++----------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
index 3963c80cd1cd..4c34e64f782a 100644
--- a/src/diffusers/pipeline_flax_utils.py
+++ b/src/diffusers/pipeline_flax_utils.py
@@ -268,18 +268,27 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         >>> from diffusers import FlaxDiffusionPipeline
 
         >>> # Download pipeline from huggingface.co and cache.
-        >>> pipeline = FlaxDiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
-
-        >>> # Download pipeline that requires an authorization token
-        >>> # For more information on access tokens, please refer to this section
-        >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens)
-        >>> pipeline = FlaxDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
-
-        >>> # Download pipeline, but overwrite scheduler
-        >>> from diffusers import LMSDiscreteScheduler
-
-        >>> scheduler = LMSDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
-        >>> pipeline = FlaxDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=scheduler)
+        >>> # Requires to be logged in to Hugging Face hub,
+        >>> # see more in [the documentation](https://huggingface.co/docs/hub/security-tokens)
+        >>> pipeline, params = FlaxDiffusionPipeline.from_pretrained(
+        ...     "runwayml/stable-diffusion-v1-5",
+        ...     revision="bf16",
+        ...     dtype=jnp.bfloat16,
+        ... )
+
+        >>> # Download pipeline, but use a different scheduler
+        >>> from diffusers import FlaxDPMSolverMultistepScheduler
+
+        >>> model_id = "runwayml/stable-diffusion-v1-5"
+        >>> sched, sched_state = FlaxDPMSolverMultistepScheduler.from_config(
+        ...     model_id,
+        ...     subfolder="scheduler",
+        ... )
+
+        >>> dpm_pipe, dpm_params = FlaxStableDiffusionPipeline.from_pretrained(
+        ...     model_id, revision="bf16", dtype=jnp.bfloat16, scheduler=dpmpp
+        ... )
+        >>> dpm_params["scheduler"] = dpmpp_state
         ```
         """
         cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)

From 8171566163f0b197282786bf39de95c130eb5fa0 Mon Sep 17 00:00:00 2001
From: ruanrz <45003574+ruanrz@users.noreply.github.com>
Date: Fri, 11 Nov 2022 19:28:20 +0800
Subject: [PATCH 178/269] [Docs] improve img2img example (#1193)

update img2img example
---
 docs/source/using-diffusers/img2img.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/using-diffusers/img2img.mdx b/docs/source/using-diffusers/img2img.mdx
index defefffe063e..911d7bd76ad0 100644
--- a/docs/source/using-diffusers/img2img.mdx
+++ b/docs/source/using-diffusers/img2img.mdx
@@ -33,7 +33,7 @@ url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/st
 
 response = requests.get(url)
 init_image = Image.open(BytesIO(response.content)).convert("RGB")
-init_image = init_image.resize((768, 512))
+init_image.thumbnail((768, 768))
 
 prompt = "A fantasy landscape, trending on artstation"
 

From 4c660d16d0c4c3b3b413f8d71b12831ccdb039af Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Sun, 13 Nov 2022 20:19:55 +0100
Subject: [PATCH 179/269] [Stable Diffusion] Fix padding / truncation (#1226)

* [Stable Diffusion] Fix padding / truncation

* finish
---
 .../pipeline_cycle_diffusion.py               |  7 +--
 .../pipeline_onnx_stable_diffusion.py         |  8 +--
 .../pipeline_onnx_stable_diffusion_img2img.py |  8 +--
 .../pipeline_onnx_stable_diffusion_inpaint.py |  8 +--
 .../pipeline_stable_diffusion.py              |  7 +--
 .../pipeline_stable_diffusion_img2img.py      |  7 +--
 .../pipeline_stable_diffusion_inpaint.py      |  7 +--
 ...ipeline_stable_diffusion_inpaint_legacy.py |  7 +--
 .../stable_diffusion/test_stable_diffusion.py | 54 ++++++++++++++++++-
 9 files changed, 88 insertions(+), 25 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 50f519c77f52..528dd33794b1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -248,17 +248,18 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
             prompt,
             padding="max_length",
             max_length=self.tokenizer.model_max_length,
+            truncation=True,
             return_tensors="pt",
         )
         text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
 
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
             logger.warning(
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
         text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index d1e2704fceeb..eceefea87426 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -114,17 +114,19 @@ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guida
             prompt,
             padding="max_length",
             max_length=self.tokenizer.model_max_length,
+            truncation=True,
             return_tensors="np",
         )
         text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
 
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+        if not np.array_equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
             logger.warning(
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
         text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
         text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index a09dfe751f4c..483b5fd2d344 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -161,17 +161,19 @@ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guida
             prompt,
             padding="max_length",
             max_length=self.tokenizer.model_max_length,
+            truncation=True,
             return_tensors="np",
         )
         text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
 
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+        if not np.array_equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
             logger.warning(
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
         text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
         text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 6c226dd432f5..8e5c20131935 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -175,17 +175,19 @@ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guida
             prompt,
             padding="max_length",
             max_length=self.tokenizer.model_max_length,
+            truncation=True,
             return_tensors="np",
         )
         text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
 
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+        if not np.array_equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
             logger.warning(
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+
         text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
         text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index ed5246fa91b6..450fbbfb1771 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -236,17 +236,18 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
             prompt,
             padding="max_length",
             max_length=self.tokenizer.model_max_length,
+            truncation=True,
             return_tensors="pt",
         )
         text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
 
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
             logger.warning(
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
         text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index b4ecf600b10e..98c813eed1dd 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -244,17 +244,18 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
             prompt,
             padding="max_length",
             max_length=self.tokenizer.model_max_length,
+            truncation=True,
             return_tensors="pt",
         )
         text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
 
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
             logger.warning(
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
         text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index f42325261a01..3f08f6edae84 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -244,17 +244,18 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
             prompt,
             padding="max_length",
             max_length=self.tokenizer.model_max_length,
+            truncation=True,
             return_tensors="pt",
         )
         text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
 
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
             logger.warning(
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
         text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 89d40f7a79ae..612aa3c1264f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -213,17 +213,18 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
             prompt,
             padding="max_length",
             max_length=self.tokenizer.model_max_length,
+            truncation=True,
             return_tensors="pt",
         )
         text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
 
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
             logger.warning(
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
         text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 6e1071124cb7..87d238c869a3 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -33,9 +33,10 @@
     UNet2DConditionModel,
     UNet2DModel,
     VQModel,
+    logging,
 )
 from diffusers.utils import floats_tensor, load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
 from ...test_pipelines_common import PipelineTesterMixin
@@ -619,6 +620,57 @@ def test_stable_diffusion_fp16(self):
 
         assert image.shape == (1, 128, 128, 3)
 
+    def test_stable_diffusion_long_prompt(self):
+        unet = self.dummy_cond_unet
+        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        do_classifier_free_guidance = True
+        negative_prompt = None
+        num_images_per_prompt = 1
+        logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
+
+        prompt = 25 * "@"
+        with CaptureLogger(logger) as cap_logger_3:
+            text_embeddings_3 = sd_pipe._encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+
+        prompt = 100 * "@"
+        with CaptureLogger(logger) as cap_logger:
+            text_embeddings = sd_pipe._encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+
+        negative_prompt = "Hello"
+        with CaptureLogger(logger) as cap_logger_2:
+            text_embeddings_2 = sd_pipe._encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+
+        assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
+        assert text_embeddings.shape[1] == 77
+
+        assert cap_logger.out == cap_logger_2.out
+        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
+        assert cap_logger.out.count("@") == 25
+        assert cap_logger_3.out == ""
+
 
 @slow
 @require_torch_gpu

From b3c5e086e5cd41712b7627894f814edf5fe38647 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Sun, 13 Nov 2022 23:54:30 +0100
Subject: [PATCH 180/269] Finalize stable diffusion refactor (#1269)

* finish

* cleaner

* more fixes

* refactor

* make fix copies

* refactor cycle diffusion

* finish

* finish2

* Apply suggestions from code review
---
 .../pipeline_cycle_diffusion.py               | 273 ++++++++++-------
 .../pipeline_stable_diffusion.py              | 164 ++++++-----
 .../pipeline_stable_diffusion_img2img.py      | 230 +++++++++------
 .../pipeline_stable_diffusion_inpaint.py      | 277 +++++++++++-------
 ...ipeline_stable_diffusion_inpaint_legacy.py | 262 +++++++++++------
 .../test_stable_diffusion_img2img.py          |  47 ++-
 .../test_stable_diffusion_inpaint_legacy.py   |   1 -
 7 files changed, 778 insertions(+), 476 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 528dd33794b1..dfdb58de4d0f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -19,6 +19,7 @@
 import torch
 
 import PIL
+from diffusers.utils import is_accelerate_available
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -178,6 +179,7 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
@@ -197,14 +199,33 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
             slice_size = self.unet.config.attention_head_dim // 2
         self.unet.set_attention_slice(slice_size)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
     def disable_attention_slicing(self):
         r"""
         Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
         back to computing attention in one step.
         """
-        # set slice_size = `None` to disable `set_attention_slice`
+        # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
+    def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device("cuda")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
     def _execution_device(self):
@@ -224,6 +245,26 @@ def _execution_device(self):
                 return torch.device(module._hf_hook.execution_device)
         return self.device
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
         r"""
@@ -310,6 +351,106 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
 
         return text_embeddings
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs
+    def check_inputs(self, prompt, strength, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps
+
+    def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        init_image = init_image.to(device=device, dtype=dtype)
+        init_latent_dist = self.vae.encode(init_image).latent_dist
+        init_latents = init_latent_dist.sample(generator=generator)
+        init_latents = 0.18215 * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many init images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
+
+        # add noise to latents using the timestep
+        noise = torch.randn(init_latents.shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        clean_latents = init_latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents, clean_latents
+
     @torch.no_grad()
     def __call__(
         self,
@@ -384,112 +525,43 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if batch_size != 1:
-            raise ValueError(
-                "At the moment only `batch_size=1` is supported for prompts, but you seem to have passed multiple"
-                f" prompts: {prompt}. Please make sure to pass only a single prompt."
-            )
-
-        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
+        # 1. Check inputs
+        self.check_inputs(prompt, strength, callback_steps)
 
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
         device = self._execution_device
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
 
+        # 3. Encode input prompt
         text_embeddings = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance, None)
         source_text_embeddings = self._encode_prompt(
             source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, None
         )
 
-        # encode the init image into latents and scale the latents
-        latents_dtype = text_embeddings.dtype
-        init_image = init_image.to(device=self.device, dtype=latents_dtype)
-        init_latent_dist = self.vae.encode(init_image).latent_dist
-        init_latents = init_latent_dist.sample(generator=generator)
-        init_latents = 0.18215 * init_latents
-
-        if isinstance(prompt, str):
-            prompt = [prompt]
-        if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0:
-            # expand init_latents for batch_size
-            deprecation_message = (
-                f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
-                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
-                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
-                " your script to pass as many init images as text prompts to suppress this warning."
-            )
-            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
-            additional_image_per_prompt = len(prompt) // init_latents.shape[0]
-            init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
-        elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
-            raise ValueError(
-                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
-            )
-        else:
-            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
-
-        # get the original timestep using init_timestep
-        offset = self.scheduler.config.get("steps_offset", 0)
-        init_timestep = int(num_inference_steps * strength) + offset
-        init_timestep = min(init_timestep, num_inference_steps)
-
-        timesteps = self.scheduler.timesteps[-init_timestep]
-        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
-
-        # add noise to latents using the timesteps
-        noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
-        clean_latents = init_latents
-        init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-
-        if not (accepts_eta and (0 < eta <= 1)):
-            raise ValueError(
-                "Currently, only the DDIM scheduler is supported. Please make sure that `pipeline.scheduler` is of"
-                f" type {DDIMScheduler.__class__} and not {self.scheduler.__class__}."
-            )
+        # 4. Preprocess image
+        if isinstance(init_image, PIL.Image.Image):
+            init_image = preprocess(init_image)
 
-        extra_step_kwargs["eta"] = eta
+        # 5. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
 
-        latents = init_latents
-        source_latents = init_latents
-
-        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        # 6. Prepare latent variables
+        latents, clean_latents = self.prepare_latents(
+            init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
+        )
+        source_latents = latents
 
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        generator = extra_step_kwargs.pop("generator", None)
 
+        # 8. Denoising loop
         for i, t in enumerate(self.progress_bar(timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2)
@@ -551,22 +623,13 @@ def __call__(
             if callback is not None and i % callback_steps == 0:
                 callback(i, t, latents)
 
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-
-        image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        # 9. Post-processing
+        image = self.decode_latents(latents)
 
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
-                self.device
-            )
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
-            )
-        else:
-            has_nsfw_concept = None
+        # 10. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
 
+        # 11. Convert to PIL
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 450fbbfb1771..e6353472933c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -298,6 +298,73 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
 
         return text_embeddings
 
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
     @torch.no_grad()
     def __call__(
         self,
@@ -371,75 +438,45 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
 
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
 
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
         device = self._execution_device
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
 
+        # 3. Encode input prompt
         text_embeddings = self._encode_prompt(
             prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
         )
 
-        # Unlike in other pipelines, latents need to be generated in the target device
-        # for 1-to-1 results reproducibility with the CompVis implementation.
-        # However this currently doesn't work in `mps`.
-
-        # get the initial random noise unless the user supplied it
-        latents_shape = (batch_size * num_images_per_prompt, self.unet.in_channels, height // 8, width // 8)
-        latents_dtype = text_embeddings.dtype
-        if latents is None:
-            if device.type == "mps":
-                # randn does not work reproducibly on mps
-                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(device)
-            else:
-                latents = torch.randn(latents_shape, generator=generator, device=device, dtype=latents_dtype)
-        else:
-            if latents.shape != latents_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-            latents = latents.to(device)
-
-        # set timesteps and move to the correct device
+        # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps_tensor = self.scheduler.timesteps
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
 
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
-        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+        # 7. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
@@ -459,22 +496,13 @@ def __call__(
             if callback is not None and i % callback_steps == 0:
                 callback(i, t, latents)
 
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-
-        image = (image / 2 + 0.5).clamp(0, 1)
-
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        # 8. Post-processing
+        image = self.decode_latents(latents)
 
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
-            )
-        else:
-            has_nsfw_concept = None
+        # 9. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
 
+        # 10. Convert to PIL
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 98c813eed1dd..9df800dc2d61 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -27,6 +27,7 @@
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import (
     DDIMScheduler,
+    DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,
     EulerDiscreteScheduler,
     LMSDiscreteScheduler,
@@ -78,6 +79,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__
     def __init__(
         self,
         vae: AutoencoderKL,
@@ -85,7 +87,12 @@ def __init__(
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
         scheduler: Union[
-            DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
         ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
@@ -139,6 +146,7 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
@@ -158,14 +166,16 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
             slice_size = self.unet.config.attention_head_dim // 2
         self.unet.set_attention_slice(slice_size)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
     def disable_attention_slicing(self):
         r"""
         Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
         back to computing attention in one step.
         """
-        # set slice_size = `None` to disable `set_attention_slice`
+        # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
     def enable_sequential_cpu_offload(self):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
@@ -202,6 +212,7 @@ def _execution_device(self):
                 return torch.device(module._hf_hook.execution_device)
         return self.device
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
     def enable_xformers_memory_efficient_attention(self):
         r"""
         Enable memory efficient attention as implemented in xformers.
@@ -214,6 +225,7 @@ def enable_xformers_memory_efficient_attention(self):
         """
         self.unet.set_use_memory_efficient_attention_xformers(True)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
     def disable_xformers_memory_efficient_attention(self):
         r"""
         Disable memory efficient attention as implemented in xformers.
@@ -306,6 +318,103 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
 
         return text_embeddings
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, prompt, strength, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps
+
+    def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        init_image = init_image.to(device=device, dtype=dtype)
+        init_latent_dist = self.vae.encode(init_image).latent_dist
+        init_latents = init_latent_dist.sample(generator=generator)
+        init_latents = 0.18215 * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many init images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
+
+        # add noise to latents using the timesteps
+        noise = torch.randn(init_latents.shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
     @torch.no_grad()
     def __call__(
         self,
@@ -379,102 +488,40 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
+        # 1. Check inputs
+        self.check_inputs(prompt, strength, callback_steps)
 
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
         device = self._execution_device
-
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
 
+        # 3. Encode input prompt
         text_embeddings = self._encode_prompt(
             prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
         )
 
-        # encode the init image into latents and scale the latents
-        latents_dtype = text_embeddings.dtype
-        init_image = init_image.to(device=device, dtype=latents_dtype)
-        init_latent_dist = self.vae.encode(init_image).latent_dist
-        init_latents = init_latent_dist.sample(generator=generator)
-        init_latents = 0.18215 * init_latents
-
-        if isinstance(prompt, str):
-            prompt = [prompt]
-        if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0:
-            # expand init_latents for batch_size
-            deprecation_message = (
-                f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
-                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
-                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
-                " your script to pass as many init images as text prompts to suppress this warning."
-            )
-            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
-            additional_image_per_prompt = len(prompt) // init_latents.shape[0]
-            init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
-        elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
-            raise ValueError(
-                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
-            )
-        else:
-            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
-
-        # get the original timestep using init_timestep
-        offset = self.scheduler.config.get("steps_offset", 0)
-        init_timestep = int(num_inference_steps * strength) + offset
-        init_timestep = min(init_timestep, num_inference_steps)
-
-        timesteps = self.scheduler.timesteps[-init_timestep]
-        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=device)
-
-        # add noise to latents using the timesteps
-        noise = torch.randn(init_latents.shape, generator=generator, device=device, dtype=latents_dtype)
-        init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
+        # 4. Preprocess image
+        if isinstance(init_image, PIL.Image.Image):
+            init_image = preprocess(init_image)
 
-        latents = init_latents
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
 
-        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
+        )
 
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps = self.scheduler.timesteps[t_start:].to(device)
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
+        # 8. Denoising loop
         for i, t in enumerate(self.progress_bar(timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
@@ -495,20 +542,13 @@ def __call__(
             if callback is not None and i % callback_steps == 0:
                 callback(i, t, latents)
 
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
+        # 9. Post-processing
+        image = self.decode_latents(latents)
 
-        image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).numpy()
-
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
-            )
-        else:
-            has_nsfw_concept = None
+        # 10. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
 
+        # 11. Convert to PIL
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 3f08f6edae84..332eb2ca777f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -139,6 +139,7 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
@@ -158,6 +159,7 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
             slice_size = self.unet.config.attention_head_dim // 2
         self.unet.set_attention_slice(slice_size)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
     def disable_attention_slicing(self):
         r"""
         Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
@@ -166,6 +168,7 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
     def enable_sequential_cpu_offload(self):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
@@ -183,6 +186,26 @@ def enable_sequential_cpu_offload(self):
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
     def _execution_device(self):
@@ -202,24 +225,6 @@ def _execution_device(self):
                 return torch.device(module._hf_hook.execution_device)
         return self.device
 
-    def enable_xformers_memory_efficient_attention(self):
-        r"""
-        Enable memory efficient attention as implemented in xformers.
-
-        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
-        time. Speed up at training time is not guaranteed.
-
-        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
-        is used.
-        """
-        self.unet.set_use_memory_efficient_attention_xformers(True)
-
-    def disable_xformers_memory_efficient_attention(self):
-        r"""
-        Disable memory efficient attention as implemented in xformers.
-        """
-        self.unet.set_use_memory_efficient_attention_xformers(False)
-
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
     def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
         r"""
@@ -306,6 +311,106 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
 
         return text_embeddings
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def prepare_mask_latents(
+        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
+    ):
+        # resize the mask to latents shape as we concatenate the mask to the latents
+        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
+        # and half precision
+        mask = torch.nn.functional.interpolate(mask, size=(height // 8, width // 8))
+        mask = mask.to(device=device, dtype=dtype)
+
+        masked_image = masked_image.to(device=device, dtype=dtype)
+
+        # encode the mask image into latents space so we can concatenate it to the latents
+        masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
+        masked_image_latents = 0.18215 * masked_image_latents
+
+        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
+        mask = mask.repeat(batch_size, 1, 1, 1)
+        masked_image_latents = masked_image_latents.repeat(batch_size, 1, 1, 1)
+
+        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
+        masked_image_latents = (
+            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        )
+
+        # aligning device to prevent device errors when concating it with the latent model input
+        masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
+        return mask, masked_image_latents
+
     @torch.no_grad()
     def __call__(
         self,
@@ -390,83 +495,59 @@ def __call__(
             (nsfw) content, according to the `safety_checker`.
         """
 
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
+        # 1. Check inputs
+        self.check_inputs(prompt, height, width, callback_steps)
 
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
         device = self._execution_device
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
 
+        # 3. Encode input prompt
         text_embeddings = self._encode_prompt(
             prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
         )
 
-        # get the initial random noise unless the user supplied it
-        # Unlike in other pipelines, latents need to be generated in the target device
-        # for 1-to-1 results reproducibility with the CompVis implementation.
-        # However this currently doesn't work in `mps`.
-        num_channels_latents = self.vae.config.latent_channels
-        latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
-        latents_dtype = text_embeddings.dtype
-        if latents is None:
-            if device.type == "mps":
-                # randn does not exist on mps
-                latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(device)
-            else:
-                latents = torch.randn(latents_shape, generator=generator, device=device, dtype=latents_dtype)
-        else:
-            if latents.shape != latents_shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
-            latents = latents.to(device)
-
-        # prepare mask and masked_image
-        mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
+        # 4. Preprocess mask and image
+        if isinstance(image, PIL.Image.Image) and isinstance(mask_image, PIL.Image.Image):
+            mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
 
-        # resize the mask to latents shape as we concatenate the mask to the latents
-        # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
-        # and half precision
-        mask = torch.nn.functional.interpolate(mask, size=(height // 8, width // 8))
-        mask = mask.to(device=device, dtype=text_embeddings.dtype)
-
-        masked_image = masked_image.to(device=device, dtype=text_embeddings.dtype)
-
-        # encode the mask image into latents space so we can concatenate it to the latents
-        masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
-        masked_image_latents = 0.18215 * masked_image_latents
-
-        # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method
-        mask = mask.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
-        masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 1, 1, 1)
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps_tensor = self.scheduler.timesteps
 
-        mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
-        masked_image_latents = (
-            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
+        # 6. Prepare latent variables
+        num_channels_latents = self.vae.config.latent_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
         )
 
-        # aligning device to prevent device errors when concating it with the latent model input
-        masked_image_latents = masked_image_latents.to(device=device, dtype=text_embeddings.dtype)
+        # 7. Prepare mask latent variables
+        mask, masked_image_latents = self.prepare_mask_latents(
+            mask,
+            masked_image,
+            batch_size,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            do_classifier_free_guidance,
+        )
 
+        # 8. Check that sizes of mask, masked image and latents match
         num_channels_mask = mask.shape[1]
         num_channels_masked_image = masked_image_latents.shape[1]
-
         if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels:
             raise ValueError(
                 f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
@@ -476,27 +557,10 @@ def __call__(
                 " `pipeline.unet` or your `mask_image` or `image` input."
             )
 
-        # set timesteps and move to the correct device
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps_tensor = self.scheduler.timesteps
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
+        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
+        # 10. Denoising loop
         for i, t in enumerate(self.progress_bar(timesteps_tensor)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
@@ -521,22 +585,13 @@ def __call__(
             if callback is not None and i % callback_steps == 0:
                 callback(i, t, latents)
 
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-
-        image = (image / 2 + 0.5).clamp(0, 1)
-
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        # 11. Post-processing
+        image = self.decode_latents(latents)
 
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
-            )
-        else:
-            has_nsfw_concept = None
+        # 12. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
 
+        # 13. Convert to PIL
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 612aa3c1264f..86d879eaa8c3 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -19,13 +19,20 @@
 import torch
 
 import PIL
-from tqdm.auto import tqdm
+from diffusers.utils import is_accelerate_available
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...pipeline_utils import DiffusionPipeline
-from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
 from ...utils import deprecate, logging
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
@@ -85,17 +92,26 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__
     def __init__(
         self,
         vae: AutoencoderKL,
         text_encoder: CLIPTextModel,
         tokenizer: CLIPTokenizer,
         unet: UNet2DConditionModel,
-        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
     ):
         super().__init__()
+
         if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
@@ -143,6 +159,7 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
@@ -162,14 +179,53 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
             slice_size = self.unet.config.attention_head_dim // 2
         self.unet.set_attention_slice(slice_size)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
     def disable_attention_slicing(self):
         r"""
         Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
         back to computing attention in one step.
         """
-        # set slice_size = `None` to disable `set_attention_slice`
+        # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
+    def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device("cuda")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
     def _execution_device(self):
@@ -275,6 +331,88 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
 
         return text_embeddings
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs
+    def check_inputs(self, prompt, strength, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps
+
+    def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator):
+        init_image = init_image.to(device=self.device, dtype=dtype)
+        init_latent_dist = self.vae.encode(init_image).latent_dist
+        init_latents = init_latent_dist.sample(generator=generator)
+        init_latents = 0.18215 * init_latents
+
+        # Expand init_latents for batch_size and num_images_per_prompt
+        init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
+        init_latents_orig = init_latents
+
+        # add noise to latents using the timesteps
+        noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=dtype)
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+        return latents, init_latents_orig, noise
+
     @torch.no_grad()
     def __call__(
         self,
@@ -353,98 +491,49 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        if isinstance(prompt, str):
-            batch_size = 1
-        elif isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if strength < 0 or strength > 1:
-            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
+        # 1. Check inputs
+        self.check_inputs(prompt, strength, callback_steps)
 
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
         device = self._execution_device
-
-        # set timesteps
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        # preprocess image
-        if not isinstance(init_image, torch.FloatTensor):
-            init_image = preprocess_image(init_image)
-
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
         # corresponds to doing no classifier free guidance.
         do_classifier_free_guidance = guidance_scale > 1.0
 
+        # 3. Encode input prompt
         text_embeddings = self._encode_prompt(
             prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
         )
 
-        # encode the init image into latents and scale the latents
-        latents_dtype = text_embeddings.dtype
-        init_image = init_image.to(device=self.device, dtype=latents_dtype)
-        init_latent_dist = self.vae.encode(init_image).latent_dist
-        init_latents = init_latent_dist.sample(generator=generator)
-        init_latents = 0.18215 * init_latents
-
-        # Expand init_latents for batch_size and num_images_per_prompt
-        init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
-        init_latents_orig = init_latents
+        # 4. Preprocess image and mask
+        if not isinstance(init_image, torch.FloatTensor):
+            init_image = preprocess_image(init_image)
 
-        # preprocess mask
         if not isinstance(mask_image, torch.FloatTensor):
             mask_image = preprocess_mask(mask_image)
-        mask_image = mask_image.to(device=self.device, dtype=latents_dtype)
-        mask = torch.cat([mask_image] * batch_size * num_images_per_prompt)
 
-        # check sizes
-        if not mask.shape == init_latents.shape:
-            raise ValueError("The mask and init_image should be the same size!")
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
 
-        # get the original timestep using init_timestep
-        offset = self.scheduler.config.get("steps_offset", 0)
-        init_timestep = int(num_inference_steps * strength) + offset
-        init_timestep = min(init_timestep, num_inference_steps)
-
-        timesteps = self.scheduler.timesteps[-init_timestep]
-        timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt, device=self.device)
-
-        # add noise to latents using the timesteps
-        noise = torch.randn(init_latents.shape, generator=generator, device=self.device, dtype=latents_dtype)
-        init_latents = self.scheduler.add_noise(init_latents, noise, timesteps)
-
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-
-        latents = init_latents
+        # 6. Prepare latent variables
+        # encode the init image into latents and scale the latents
+        latents, init_latents_orig, noise = self.prepare_latents(
+            init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
+        )
 
-        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        # 7. Prepare mask latent
+        mask = mask_image.to(device=self.device, dtype=latents.dtype)
+        mask = torch.cat([mask] * batch_size * num_images_per_prompt)
 
-        # Some schedulers like PNDM have timesteps as arrays
-        # It's more optimized to move all timesteps to correct device beforehand
-        timesteps = self.scheduler.timesteps[t_start:].to(self.device)
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
-        for i, t in tqdm(enumerate(timesteps)):
+        # 9. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
             # expand the latents if we are doing classifier free guidance
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
@@ -468,22 +557,13 @@ def __call__(
             if callback is not None and i % callback_steps == 0:
                 callback(i, t, latents)
 
-        latents = 1 / 0.18215 * latents
-        image = self.vae.decode(latents).sample
-
-        image = (image / 2 + 0.5).clamp(0, 1)
-        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        # 10. Post-processing
+        image = self.decode_latents(latents)
 
-        if self.safety_checker is not None:
-            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
-                self.device
-            )
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
-            )
-        else:
-            has_nsfw_concept = None
+        # 11. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
 
+        # 12. Convert to PIL
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 6d5c6feab5bc..3c0fa8aa818f 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -22,6 +22,7 @@
 
 from diffusers import (
     AutoencoderKL,
+    DDIMScheduler,
     LMSDiscreteScheduler,
     PNDMScheduler,
     StableDiffusionImg2ImgPipeline,
@@ -479,7 +480,7 @@ def test_stable_diffusion_img2img_pipeline_default(self):
         )
         init_image = init_image.resize((768, 512))
         expected_image = load_numpy(
-            "https://huggingface.co/datasets/lewington/expected-images/resolve/main/fantasy_landscape.npy"
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape.npy"
         )
 
         model_id = "CompVis/stable-diffusion-v1-4"
@@ -506,7 +507,7 @@ def test_stable_diffusion_img2img_pipeline_default(self):
 
         assert image.shape == (512, 768, 3)
         # img2img is flaky across GPUs even in fp32, so using MAE here
-        assert np.abs(expected_image - image).mean() < 1e-3
+        assert np.abs(expected_image - image).max() < 1e-3
 
     def test_stable_diffusion_img2img_pipeline_k_lms(self):
         init_image = load_image(
@@ -515,7 +516,7 @@ def test_stable_diffusion_img2img_pipeline_k_lms(self):
         )
         init_image = init_image.resize((768, 512))
         expected_image = load_numpy(
-            "https://huggingface.co/datasets/lewington/expected-images/resolve/main/fantasy_landscape_k_lms.npy"
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_k_lms.npy"
         )
 
         model_id = "CompVis/stable-diffusion-v1-4"
@@ -543,8 +544,44 @@ def test_stable_diffusion_img2img_pipeline_k_lms(self):
         image = output.images[0]
 
         assert image.shape == (512, 768, 3)
-        # img2img is flaky across GPUs even in fp32, so using MAE here
-        assert np.abs(expected_image - image).mean() < 1e-3
+        assert np.abs(expected_image - image).max() < 1e-3
+
+    def test_stable_diffusion_img2img_pipeline_ddim(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((768, 512))
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_ddim.npy"
+        )
+
+        model_id = "CompVis/stable-diffusion-v1-4"
+        ddim = DDIMScheduler.from_config(model_id, subfolder="scheduler")
+        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+            model_id,
+            scheduler=ddim,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 768, 3)
+        assert np.abs(expected_image - image).max() < 1e-3
 
     def test_stable_diffusion_img2img_intermediate_state(self):
         number_of_steps = 0
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index c5b2572fb79a..4b535dc9dfbd 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -387,7 +387,6 @@ def test_stable_diffusion_inpaint_legacy_pipeline(self):
         assert np.abs(expected_image - image).max() < 1e-3
 
     def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
-        # TODO(Anton, Patrick) - I think we can remove this test soon
         init_image = load_image(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo.png"

From 33d7e89c42e0fe7b4a277d7a5bae12ba14828dd8 Mon Sep 17 00:00:00 2001
From: Lime-Cakes <91322985+Lime-Cakes@users.noreply.github.com>
Date: Mon, 14 Nov 2022 20:35:47 +0800
Subject: [PATCH 181/269] Edited attention.py for older xformers (#1270)

Older versions of xformers require query, key, value to be contiguous, this calls .contiguous() on q/k/v before passing to xformers.
---
 src/diffusers/models/attention.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index e8ea37970e04..be9203b4d699 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -557,6 +557,9 @@ def _sliced_attention(self, query, key, value, sequence_length, dim):
         return hidden_states
 
     def _memory_efficient_attention_xformers(self, query, key, value):
+        query = query.contiguous()
+        key = key.contiguous()
+        value = value.contiguous()
         hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=None)
         hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
         return hidden_states

From c9b34637039b8b7374ae15c70aa93289bc5b6ccd Mon Sep 17 00:00:00 2001
From: Partho <parthodas6176@gmail.com>
Date: Tue, 15 Nov 2022 01:12:14 +0530
Subject: [PATCH 182/269] Fix wrong link in text2img fine-tuning documentation
 (#1282)

fix link typo
---
 docs/source/training/text2image.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/training/text2image.mdx b/docs/source/training/text2image.mdx
index 1b04462f77f1..eb71457cb758 100644
--- a/docs/source/training/text2image.mdx
+++ b/docs/source/training/text2image.mdx
@@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License.
 
 # Stable Diffusion text-to-image fine-tuning
 
-The [`train_text_to_image.py`](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) script shows how to fine-tune the stable diffusion model on your own dataset.
+The [`train_text_to_image.py`](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) script shows how to fine-tune the stable diffusion model on your own dataset.
 
 <Tip warning={true}>
 

From ec7c8d32b088985ec73bd5fe967b8cf6f2c91144 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 14 Nov 2022 19:43:17 +0000
Subject: [PATCH 183/269] add conversion script for vae

---
 ...onvert_versatile_diffusion_to_diffusers.py | 748 ++++++++++++++++++
 v1-inference.yaml                             |  70 ++
 2 files changed, 818 insertions(+)
 create mode 100644 scripts/convert_versatile_diffusion_to_diffusers.py
 create mode 100644 v1-inference.yaml

diff --git a/scripts/convert_versatile_diffusion_to_diffusers.py b/scripts/convert_versatile_diffusion_to_diffusers.py
new file mode 100644
index 000000000000..20ac78f944ea
--- /dev/null
+++ b/scripts/convert_versatile_diffusion_to_diffusers.py
@@ -0,0 +1,748 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Conversion script for the Versatile Stable Diffusion checkpoints. """
+
+import argparse
+import os
+
+import torch
+
+
+try:
+    from omegaconf import OmegaConf
+except ImportError:
+    raise ImportError(
+        "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`."
+    )
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LDMTextToImagePipeline,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
+from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "query.weight")
+        new_item = new_item.replace("q.bias", "query.bias")
+
+        new_item = new_item.replace("k.weight", "key.weight")
+        new_item = new_item.replace("k.bias", "key.bias")
+
+        new_item = new_item.replace("v.weight", "value.weight")
+        new_item = new_item.replace("v.bias", "value.bias")
+
+        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
+        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming
+    to them. It splits attention layers, and takes into account additional replacements
+    that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        else:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def create_unet_diffusers_config(original_config):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    unet_params = original_config.model.params.unet_config.params
+
+    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    config = dict(
+        sample_size=unet_params.image_size,
+        in_channels=unet_params.in_channels,
+        out_channels=unet_params.out_channels,
+        down_block_types=tuple(down_block_types),
+        up_block_types=tuple(up_block_types),
+        block_out_channels=tuple(block_out_channels),
+        layers_per_block=unet_params.num_res_blocks,
+        cross_attention_dim=unet_params.context_dim,
+        attention_head_dim=unet_params.num_heads,
+    )
+
+    return config
+
+
+def create_vae_diffusers_config(original_config):
+    """
+    Creates a config for the diffusers based on the config of the LDM model.
+    """
+    vae_params = original_config.model.params.first_stage_config.params.ddconfig
+    _ = original_config.model.params.first_stage_config.params.embed_dim
+
+    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    config = dict(
+        sample_size=vae_params.resolution,
+        in_channels=vae_params.in_channels,
+        out_channels=vae_params.out_ch,
+        down_block_types=tuple(down_block_types),
+        up_block_types=tuple(up_block_types),
+        block_out_channels=tuple(block_out_channels),
+        latent_channels=vae_params.z_channels,
+        layers_per_block=vae_params.num_res_blocks,
+    )
+    return config
+
+
+def create_diffusers_schedular(original_config):
+    schedular = DDIMScheduler(
+        num_train_timesteps=original_config.model.params.timesteps,
+        beta_start=original_config.model.params.linear_start,
+        beta_end=original_config.model.params.linear_end,
+        beta_schedule="scaled_linear",
+    )
+    return schedular
+
+
+def create_ldm_bert_config(original_config):
+    bert_params = original_config.model.parms.cond_stage_config.params
+    config = LDMBertConfig(
+        d_model=bert_params.n_embed,
+        encoder_layers=bert_params.n_layer,
+        encoder_ffn_dim=bert_params.n_embed * 4,
+    )
+    return config
+
+
+def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    unet_key = "model.diffusion_model."
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100:
+        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+        if extract_ema:
+            print(
+                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+            )
+            for key in keys:
+                if key.startswith("model.diffusion_model"):
+                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+        else:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+
+    for key in keys:
+        if key.startswith(unet_key):
+            unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            resnet_0_paths = renew_resnet_paths(resnets)
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            if ["conv.weight", "conv.bias"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+def convert_ldm_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    keys = list(checkpoint.keys())
+    for key in keys:
+        vae_state_dict[key] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+def convert_ldm_bert_checkpoint(checkpoint, config):
+    def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
+        hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
+        hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
+        hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
+
+        hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
+        hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
+
+    def _copy_linear(hf_linear, pt_linear):
+        hf_linear.weight = pt_linear.weight
+        hf_linear.bias = pt_linear.bias
+
+    def _copy_layer(hf_layer, pt_layer):
+        # copy layer norms
+        _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
+        _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
+
+        # copy attn
+        _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
+
+        # copy MLP
+        pt_mlp = pt_layer[1][1]
+        _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
+        _copy_linear(hf_layer.fc2, pt_mlp.net[2])
+
+    def _copy_layers(hf_layers, pt_layers):
+        for i, hf_layer in enumerate(hf_layers):
+            if i != 0:
+                i += i
+            pt_layer = pt_layers[i : i + 2]
+            _copy_layer(hf_layer, pt_layer)
+
+    hf_model = LDMBertModel(config).eval()
+
+    # copy  embeds
+    hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
+    hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight
+
+    # copy layer norm
+    _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)
+
+    # copy hidden layers
+    _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)
+
+    _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)
+
+    return hf_model
+
+
+def convert_ldm_clip_checkpoint(checkpoint):
+    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+
+    keys = list(checkpoint.keys())
+
+    text_model_dict = {}
+
+    for key in keys:
+        if key.startswith("cond_stage_model.transformer"):
+            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
+
+    text_model.load_state_dict(text_model_dict)
+
+    return text_model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--unet_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--vae_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--optimus_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml
+    parser.add_argument(
+        "--original_config_file",
+        default=None,
+        type=str,
+        help="The YAML config file corresponding to the original architecture.",
+    )
+    parser.add_argument(
+        "--scheduler_type",
+        default="pndm",
+        type=str,
+        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancest', 'dpm']",
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    args = parser.parse_args()
+
+    if args.original_config_file is None:
+        os.system(
+            "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
+        )
+        args.original_config_file = "./v1-inference.yaml"
+
+    original_config = OmegaConf.load(args.original_config_file)
+
+
+    num_train_timesteps = original_config.model.params.timesteps
+    beta_start = original_config.model.params.linear_start
+    beta_end = original_config.model.params.linear_end
+    if args.scheduler_type == "pndm":
+        scheduler = PNDMScheduler(
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            beta_start=beta_start,
+            num_train_timesteps=num_train_timesteps,
+            skip_prk_steps=True,
+        )
+    elif args.scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
+    elif args.scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
+    elif args.scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler(
+            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+        )
+    elif args.scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler(
+            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+        )
+    elif args.scheduler_type == "ddim":
+        scheduler = DDIMScheduler(
+            beta_start=beta_start,
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+    else:
+        raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
+
+    # Convert the UNet2DConditionModel model.
+#    checkpoint = torch.load(args.unet_checkpoint_path)
+#    unet_config = create_unet_diffusers_config(original_config)
+#    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
+#        checkpoint, unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema
+#    )
+#
+#    unet = UNet2DConditionModel(**unet_config)
+#    unet.load_state_dict(converted_unet_checkpoint)
+
+    # Convert the VAE model.
+    if args.vae_checkpoint_path is not None:
+        vae_config = create_vae_diffusers_config(original_config)
+        checkpoint = torch.load(args.vae_checkpoint_path)
+        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
+
+        vae = AutoencoderKL(**vae_config)
+        vae.load_state_dict(converted_vae_checkpoint)
+        vae.save_pretrained(os.path.join(args.dump_path, "vae"))
+
+    # Convert the text model.
+#    text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
+#    if text_model_type == "FrozenCLIPEmbedder":
+#        text_model = convert_ldm_clip_checkpoint(checkpoint)
+#        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+#        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
+#        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
+#        pipe = StableDiffusionPipeline(
+#            vae=vae,
+#            text_encoder=text_model,
+#            tokenizer=tokenizer,
+#            unet=unet,
+#            scheduler=scheduler,
+#            safety_checker=safety_checker,
+#            feature_extractor=feature_extractor,
+#        )
+#    else:
+#        text_config = create_ldm_bert_config(original_config)
+#        text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
+#        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+#        pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+#
+#    pipe.save_pretrained(args.dump_path)
diff --git a/v1-inference.yaml b/v1-inference.yaml
new file mode 100644
index 000000000000..d4effe569e89
--- /dev/null
+++ b/v1-inference.yaml
@@ -0,0 +1,70 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

From a8d0977769debad7c1071895f576f9413e2b967d Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Mon, 14 Nov 2022 22:03:10 +0100
Subject: [PATCH 184/269] [StableDiffusionInpaintPipeline] fix batch_size for
 mask and masked latents (#1279)

fix bs for mask and masked latents
---
 .../pipeline_stable_diffusion_inpaint.py      |  2 +-
 .../test_stable_diffusion_inpaint.py          | 41 +++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 332eb2ca777f..a122723eee6d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -536,7 +536,7 @@ def __call__(
         mask, masked_image_latents = self.prepare_mask_latents(
             mask,
             masked_image,
-            batch_size,
+            batch_size * num_images_per_prompt,
             height,
             width,
             text_embeddings.dtype,
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 5fcdd71dd6e4..8d269c38f9f4 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -215,6 +215,47 @@ def test_stable_diffusion_inpaint(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
+    def test_stable_diffusion_inpaint_with_num_images_per_prompt(self):
+        device = "cpu"
+        unet = self.dummy_cond_unet_inpaint
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionInpaintPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        images = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+            mask_image=mask_image,
+            num_images_per_prompt=2,
+        ).images
+
+        # check if the output is a list of 2 images
+        assert len(images) == 2
+
     @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
     def test_stable_diffusion_inpaint_fp16(self):
         """Test that stable diffusion inpaint_legacy works with fp16"""

From 7c5fef81e0aecff65c041a9dfb23aff22bf64f4b Mon Sep 17 00:00:00 2001
From: Nathan Lambert <nathan@huggingface.co>
Date: Mon, 14 Nov 2022 13:48:48 -0800
Subject: [PATCH 185/269] Add UNet 1d for RL model for planning + colab (#105)

* re-add RL model code

* match model forward api

* add register_to_config, pass training tests

* fix tests, update forward outputs

* remove unused code, some comments

* add to docs

* remove extra embedding code

* unify time embedding

* remove conv1d output sequential

* remove sequential from conv1dblock

* style and deleting duplicated code

* clean files

* remove unused variables

* clean variables

* add 1d resnet block structure for downsample

* rename as unet1d

* fix renaming

* rename files

* add get_block(...) api

* unify args for model1d like model2d

* minor cleaning

* fix docs

* improve 1d resnet blocks

* fix tests, remove permuts

* fix style

* add output activation

* rename flax blocks file

* Add Value Function and corresponding example script to Diffuser implementation (#884)

* valuefunction code

* start example scripts

* missing imports

* bug fixes and placeholder example script

* add value function scheduler

* load value function from hub and get best actions in example

* very close to working example

* larger batch size for planning

* more tests

* merge unet1d changes

* wandb for debugging, use newer models

* success!

* turns out we just need more diffusion steps

* run on modal

* merge and code cleanup

* use same api for rl model

* fix variance type

* wrong normalization function

* add tests

* style

* style and quality

* edits based on comments

* style and quality

* remove unused var

* hack unet1d into a value function

* add pipeline

* fix arg order

* add pipeline to core library

* community pipeline

* fix couple shape bugs

* style

* Apply suggestions from code review

Co-authored-by: Nathan Lambert <nathan@huggingface.co>

* update post merge of scripts

* add mdiblock / outblock architecture

* Pipeline cleanup (#947)

* valuefunction code

* start example scripts

* missing imports

* bug fixes and placeholder example script

* add value function scheduler

* load value function from hub and get best actions in example

* very close to working example

* larger batch size for planning

* more tests

* merge unet1d changes

* wandb for debugging, use newer models

* success!

* turns out we just need more diffusion steps

* run on modal

* merge and code cleanup

* use same api for rl model

* fix variance type

* wrong normalization function

* add tests

* style

* style and quality

* edits based on comments

* style and quality

* remove unused var

* hack unet1d into a value function

* add pipeline

* fix arg order

* add pipeline to core library

* community pipeline

* fix couple shape bugs

* style

* Apply suggestions from code review

* clean up comments

* convert older script to using pipeline and add readme

* rename scripts

* style, update tests

* delete unet rl model file

* remove imports in src

Co-authored-by: Nathan Lambert <nathan@huggingface.co>

* Update src/diffusers/models/unet_1d_blocks.py

* Update tests/test_models_unet.py

* RL Cleanup v2 (#965)

* valuefunction code

* start example scripts

* missing imports

* bug fixes and placeholder example script

* add value function scheduler

* load value function from hub and get best actions in example

* very close to working example

* larger batch size for planning

* more tests

* merge unet1d changes

* wandb for debugging, use newer models

* success!

* turns out we just need more diffusion steps

* run on modal

* merge and code cleanup

* use same api for rl model

* fix variance type

* wrong normalization function

* add tests

* style

* style and quality

* edits based on comments

* style and quality

* remove unused var

* hack unet1d into a value function

* add pipeline

* fix arg order

* add pipeline to core library

* community pipeline

* fix couple shape bugs

* style

* Apply suggestions from code review

* clean up comments

* convert older script to using pipeline and add readme

* rename scripts

* style, update tests

* delete unet rl model file

* remove imports in src

* add specific vf block and update tests

* style

* Update tests/test_models_unet.py

Co-authored-by: Nathan Lambert <nathan@huggingface.co>

* fix quality in tests

* fix quality style, split test file

* fix checks / tests

* make timesteps closer to main

* unify block API

* unify forward api

* delete lines in examples

* style

* examples style

* all tests pass

* make style

* make dance_diff test pass

* Refactoring RL PR (#1200)

* init file changes

* add import utils

* finish cleaning files, imports

* remove import flags

* clean examples

* fix imports, tests for merge

* update readmes

* hotfix for tests

* quality

* fix some tests

* change defaults

* more mps test fixes

* unet1d defaults

* do not default import experimental

* defaults for tests

* fix tests

* fix-copies

* fix

* changes per Patrik's comments (#1285)

* changes per Patrik's comments

* update conversion script

* fix renaming

* skip more mps tests

* last test fix

* Update examples/rl/README.md

Co-authored-by: Ben Glickenhaus <benglickenhaus@gmail.com>
---
 .gitignore                                    |   4 +-
 docs/source/api/models.mdx                    |   9 +-
 examples/README.md                            |   2 +-
 examples/rl/README.md                         |  19 +
 examples/rl/run_diffuser_gen_trajectories.py  |  57 +++
 examples/rl/run_diffuser_locomotion.py        |  57 +++
 .../convert_models_diffuser_to_diffusers.py   | 100 +++++
 src/diffusers/experimental/README.md          |   5 +
 src/diffusers/experimental/__init__.py        |   1 +
 src/diffusers/experimental/rl/__init__.py     |   1 +
 .../experimental/rl/value_guided_sampling.py  | 129 +++++++
 src/diffusers/models/embeddings.py            |  13 +-
 src/diffusers/models/resnet.py                | 138 ++++++-
 src/diffusers/models/unet_1d.py               | 115 ++++--
 src/diffusers/models/unet_1d_blocks.py        | 346 ++++++++++++++++--
 src/diffusers/schedulers/scheduling_ddpm.py   |   6 +-
 tests/models/test_models_unet_1d.py           | 235 +++++++++++-
 .../dance_diffusion/test_dance_diffusion.py   |   4 +
 18 files changed, 1176 insertions(+), 65 deletions(-)
 create mode 100644 examples/rl/README.md
 create mode 100644 examples/rl/run_diffuser_gen_trajectories.py
 create mode 100644 examples/rl/run_diffuser_locomotion.py
 create mode 100644 scripts/convert_models_diffuser_to_diffusers.py
 create mode 100644 src/diffusers/experimental/README.md
 create mode 100644 src/diffusers/experimental/__init__.py
 create mode 100644 src/diffusers/experimental/rl/__init__.py
 create mode 100644 src/diffusers/experimental/rl/value_guided_sampling.py

diff --git a/.gitignore b/.gitignore
index cf8183463613..f018a111ea33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,4 +163,6 @@ tags
 *.lock
 
 # DS_Store (MacOS)
-.DS_Store
\ No newline at end of file
+.DS_Store
+# RL pipelines may produce mp4 outputs
+*.mp4
\ No newline at end of file
diff --git a/docs/source/api/models.mdx b/docs/source/api/models.mdx
index 2e1e8798a7ef..7c1faa847440 100644
--- a/docs/source/api/models.mdx
+++ b/docs/source/api/models.mdx
@@ -22,12 +22,15 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module
 ## UNet2DOutput
 [[autodoc]] models.unet_2d.UNet2DOutput
 
-## UNet1DModel
-[[autodoc]] UNet1DModel
-
 ## UNet2DModel
 [[autodoc]] UNet2DModel
 
+## UNet1DOutput
+[[autodoc]] models.unet_1d.UNet1DOutput
+
+## UNet1DModel
+[[autodoc]] UNet1DModel
+
 ## UNet2DConditionOutput
 [[autodoc]] models.unet_2d_condition.UNet2DConditionOutput
 
diff --git a/examples/README.md b/examples/README.md
index 29872a7a164e..06ce06b9e3bc 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -42,7 +42,7 @@ Training examples show how to pretrain or fine-tune diffusion models for a varie
 | [**Text-to-Image fine-tuning**](./text_to_image) | ✅ | ✅ | 
 | [**Textual Inversion**](./textual_inversion) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb)
 | [**Dreambooth**](./dreambooth) | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb)
-
+| [**Reinforcement Learning for Control**](https://github.com/huggingface/diffusers/blob/main/examples/rl/run_diffusers_locomotion.py)                    | - | - | coming soon.
 
 ## Community
 
diff --git a/examples/rl/README.md b/examples/rl/README.md
new file mode 100644
index 000000000000..d68f2bf780e3
--- /dev/null
+++ b/examples/rl/README.md
@@ -0,0 +1,19 @@
+# Overview
+
+These examples show how to run (Diffuser)[https://arxiv.org/abs/2205.09991] in Diffusers. 
+There are four scripts, 
+1. `run_diffuser_locomotion.py` to sample actions and run them in the environment,
+2. and `run_diffuser_gen_trajectories.py` to just sample actions from the pre-trained diffusion model.
+
+You will need some RL specific requirements to run the examples:
+
+```
+pip install -f https://download.pytorch.org/whl/torch_stable.html \
+                free-mujoco-py \
+                einops \
+                gym==0.24.1 \
+                protobuf==3.20.1 \
+                git+https://github.com/rail-berkeley/d4rl.git \
+                mediapy \
+                Pillow==9.0.0
+```
diff --git a/examples/rl/run_diffuser_gen_trajectories.py b/examples/rl/run_diffuser_gen_trajectories.py
new file mode 100644
index 000000000000..5bb068cc9fc7
--- /dev/null
+++ b/examples/rl/run_diffuser_gen_trajectories.py
@@ -0,0 +1,57 @@
+import d4rl  # noqa
+import gym
+import tqdm
+from diffusers.experimental import ValueGuidedRLPipeline
+
+
+config = dict(
+    n_samples=64,
+    horizon=32,
+    num_inference_steps=20,
+    n_guide_steps=0,
+    scale_grad_by_std=True,
+    scale=0.1,
+    eta=0.0,
+    t_grad_cutoff=2,
+    device="cpu",
+)
+
+
+if __name__ == "__main__":
+    env_name = "hopper-medium-v2"
+    env = gym.make(env_name)
+
+    pipeline = ValueGuidedRLPipeline.from_pretrained(
+        "bglick13/hopper-medium-v2-value-function-hor32",
+        env=env,
+    )
+
+    env.seed(0)
+    obs = env.reset()
+    total_reward = 0
+    total_score = 0
+    T = 1000
+    rollout = [obs.copy()]
+    try:
+        for t in tqdm.tqdm(range(T)):
+            # Call the policy
+            denorm_actions = pipeline(obs, planning_horizon=32)
+
+            # execute action in environment
+            next_observation, reward, terminal, _ = env.step(denorm_actions)
+            score = env.get_normalized_score(total_reward)
+            # update return
+            total_reward += reward
+            total_score += score
+            print(
+                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
+                f" {total_score}"
+            )
+            # save observations for rendering
+            rollout.append(next_observation.copy())
+
+            obs = next_observation
+    except KeyboardInterrupt:
+        pass
+
+    print(f"Total reward: {total_reward}")
diff --git a/examples/rl/run_diffuser_locomotion.py b/examples/rl/run_diffuser_locomotion.py
new file mode 100644
index 000000000000..e89181610b33
--- /dev/null
+++ b/examples/rl/run_diffuser_locomotion.py
@@ -0,0 +1,57 @@
+import d4rl  # noqa
+import gym
+import tqdm
+from diffusers.experimental import ValueGuidedRLPipeline
+
+
+config = dict(
+    n_samples=64,
+    horizon=32,
+    num_inference_steps=20,
+    n_guide_steps=2,
+    scale_grad_by_std=True,
+    scale=0.1,
+    eta=0.0,
+    t_grad_cutoff=2,
+    device="cpu",
+)
+
+
+if __name__ == "__main__":
+    env_name = "hopper-medium-v2"
+    env = gym.make(env_name)
+
+    pipeline = ValueGuidedRLPipeline.from_pretrained(
+        "bglick13/hopper-medium-v2-value-function-hor32",
+        env=env,
+    )
+
+    env.seed(0)
+    obs = env.reset()
+    total_reward = 0
+    total_score = 0
+    T = 1000
+    rollout = [obs.copy()]
+    try:
+        for t in tqdm.tqdm(range(T)):
+            # call the policy
+            denorm_actions = pipeline(obs, planning_horizon=32)
+
+            # execute action in environment
+            next_observation, reward, terminal, _ = env.step(denorm_actions)
+            score = env.get_normalized_score(total_reward)
+            # update return
+            total_reward += reward
+            total_score += score
+            print(
+                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
+                f" {total_score}"
+            )
+            # save observations for rendering
+            rollout.append(next_observation.copy())
+
+            obs = next_observation
+    except KeyboardInterrupt:
+        pass
+
+    print(f"Total reward: {total_reward}")
diff --git a/scripts/convert_models_diffuser_to_diffusers.py b/scripts/convert_models_diffuser_to_diffusers.py
new file mode 100644
index 000000000000..9475f7da93fb
--- /dev/null
+++ b/scripts/convert_models_diffuser_to_diffusers.py
@@ -0,0 +1,100 @@
+import json
+import os
+
+import torch
+
+from diffusers import UNet1DModel
+
+
+os.makedirs("hub/hopper-medium-v2/unet/hor32", exist_ok=True)
+os.makedirs("hub/hopper-medium-v2/unet/hor128", exist_ok=True)
+
+os.makedirs("hub/hopper-medium-v2/value_function", exist_ok=True)
+
+
+def unet(hor):
+    if hor == 128:
+        down_block_types = ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D")
+        block_out_channels = (32, 128, 256)
+        up_block_types = ("UpResnetBlock1D", "UpResnetBlock1D")
+
+    elif hor == 32:
+        down_block_types = ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D")
+        block_out_channels = (32, 64, 128, 256)
+        up_block_types = ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D")
+    model = torch.load(f"/Users/bglickenhaus/Documents/diffuser/temporal_unet-hopper-mediumv2-hor{hor}.torch")
+    state_dict = model.state_dict()
+    config = dict(
+        down_block_types=down_block_types,
+        block_out_channels=block_out_channels,
+        up_block_types=up_block_types,
+        layers_per_block=1,
+        use_timestep_embedding=True,
+        out_block_type="OutConv1DBlock",
+        norm_num_groups=8,
+        downsample_each_block=False,
+        in_channels=14,
+        out_channels=14,
+        extra_in_channels=0,
+        time_embedding_type="positional",
+        flip_sin_to_cos=False,
+        freq_shift=1,
+        sample_size=65536,
+        mid_block_type="MidResTemporalBlock1D",
+        act_fn="mish",
+    )
+    hf_value_function = UNet1DModel(**config)
+    print(f"length of state dict: {len(state_dict.keys())}")
+    print(f"length of value function dict: {len(hf_value_function.state_dict().keys())}")
+    mapping = dict((k, hfk) for k, hfk in zip(model.state_dict().keys(), hf_value_function.state_dict().keys()))
+    for k, v in mapping.items():
+        state_dict[v] = state_dict.pop(k)
+    hf_value_function.load_state_dict(state_dict)
+
+    torch.save(hf_value_function.state_dict(), f"hub/hopper-medium-v2/unet/hor{hor}/diffusion_pytorch_model.bin")
+    with open(f"hub/hopper-medium-v2/unet/hor{hor}/config.json", "w") as f:
+        json.dump(config, f)
+
+
+def value_function():
+    config = dict(
+        in_channels=14,
+        down_block_types=("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"),
+        up_block_types=(),
+        out_block_type="ValueFunction",
+        mid_block_type="ValueFunctionMidBlock1D",
+        block_out_channels=(32, 64, 128, 256),
+        layers_per_block=1,
+        downsample_each_block=True,
+        sample_size=65536,
+        out_channels=14,
+        extra_in_channels=0,
+        time_embedding_type="positional",
+        use_timestep_embedding=True,
+        flip_sin_to_cos=False,
+        freq_shift=1,
+        norm_num_groups=8,
+        act_fn="mish",
+    )
+
+    model = torch.load("/Users/bglickenhaus/Documents/diffuser/value_function-hopper-mediumv2-hor32.torch")
+    state_dict = model
+    hf_value_function = UNet1DModel(**config)
+    print(f"length of state dict: {len(state_dict.keys())}")
+    print(f"length of value function dict: {len(hf_value_function.state_dict().keys())}")
+
+    mapping = dict((k, hfk) for k, hfk in zip(state_dict.keys(), hf_value_function.state_dict().keys()))
+    for k, v in mapping.items():
+        state_dict[v] = state_dict.pop(k)
+
+    hf_value_function.load_state_dict(state_dict)
+
+    torch.save(hf_value_function.state_dict(), "hub/hopper-medium-v2/value_function/diffusion_pytorch_model.bin")
+    with open("hub/hopper-medium-v2/value_function/config.json", "w") as f:
+        json.dump(config, f)
+
+
+if __name__ == "__main__":
+    unet(32)
+    # unet(128)
+    value_function()
diff --git a/src/diffusers/experimental/README.md b/src/diffusers/experimental/README.md
new file mode 100644
index 000000000000..81a9de81c737
--- /dev/null
+++ b/src/diffusers/experimental/README.md
@@ -0,0 +1,5 @@
+# 🧨 Diffusers Experimental
+
+We are adding experimental code to support novel applications and usages of the Diffusers library.
+Currently, the following experiments are supported:
+* Reinforcement learning via an implementation of the [Diffuser](https://arxiv.org/abs/2205.09991) model.
\ No newline at end of file
diff --git a/src/diffusers/experimental/__init__.py b/src/diffusers/experimental/__init__.py
new file mode 100644
index 000000000000..ebc815540301
--- /dev/null
+++ b/src/diffusers/experimental/__init__.py
@@ -0,0 +1 @@
+from .rl import ValueGuidedRLPipeline
diff --git a/src/diffusers/experimental/rl/__init__.py b/src/diffusers/experimental/rl/__init__.py
new file mode 100644
index 000000000000..7b338d3173e1
--- /dev/null
+++ b/src/diffusers/experimental/rl/__init__.py
@@ -0,0 +1 @@
+from .value_guided_sampling import ValueGuidedRLPipeline
diff --git a/src/diffusers/experimental/rl/value_guided_sampling.py b/src/diffusers/experimental/rl/value_guided_sampling.py
new file mode 100644
index 000000000000..8d5062e3d4c5
--- /dev/null
+++ b/src/diffusers/experimental/rl/value_guided_sampling.py
@@ -0,0 +1,129 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import torch
+
+import tqdm
+
+from ...models.unet_1d import UNet1DModel
+from ...pipeline_utils import DiffusionPipeline
+from ...utils.dummy_pt_objects import DDPMScheduler
+
+
+class ValueGuidedRLPipeline(DiffusionPipeline):
+    def __init__(
+        self,
+        value_function: UNet1DModel,
+        unet: UNet1DModel,
+        scheduler: DDPMScheduler,
+        env,
+    ):
+        super().__init__()
+        self.value_function = value_function
+        self.unet = unet
+        self.scheduler = scheduler
+        self.env = env
+        self.data = env.get_dataset()
+        self.means = dict()
+        for key in self.data.keys():
+            try:
+                self.means[key] = self.data[key].mean()
+            except:
+                pass
+        self.stds = dict()
+        for key in self.data.keys():
+            try:
+                self.stds[key] = self.data[key].std()
+            except:
+                pass
+        self.state_dim = env.observation_space.shape[0]
+        self.action_dim = env.action_space.shape[0]
+
+    def normalize(self, x_in, key):
+        return (x_in - self.means[key]) / self.stds[key]
+
+    def de_normalize(self, x_in, key):
+        return x_in * self.stds[key] + self.means[key]
+
+    def to_torch(self, x_in):
+        if type(x_in) is dict:
+            return {k: self.to_torch(v) for k, v in x_in.items()}
+        elif torch.is_tensor(x_in):
+            return x_in.to(self.unet.device)
+        return torch.tensor(x_in, device=self.unet.device)
+
+    def reset_x0(self, x_in, cond, act_dim):
+        for key, val in cond.items():
+            x_in[:, key, act_dim:] = val.clone()
+        return x_in
+
+    def run_diffusion(self, x, conditions, n_guide_steps, scale):
+        batch_size = x.shape[0]
+        y = None
+        for i in tqdm.tqdm(self.scheduler.timesteps):
+            # create batch of timesteps to pass into model
+            timesteps = torch.full((batch_size,), i, device=self.unet.device, dtype=torch.long)
+            for _ in range(n_guide_steps):
+                with torch.enable_grad():
+                    x.requires_grad_()
+                    y = self.value_function(x.permute(0, 2, 1), timesteps).sample
+                    grad = torch.autograd.grad([y.sum()], [x])[0]
+
+                    posterior_variance = self.scheduler._get_variance(i)
+                    model_std = torch.exp(0.5 * posterior_variance)
+                    grad = model_std * grad
+                grad[timesteps < 2] = 0
+                x = x.detach()
+                x = x + scale * grad
+                x = self.reset_x0(x, conditions, self.action_dim)
+            prev_x = self.unet(x.permute(0, 2, 1), timesteps).sample.permute(0, 2, 1)
+            x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"]
+
+            # apply conditions to the trajectory
+            x = self.reset_x0(x, conditions, self.action_dim)
+            x = self.to_torch(x)
+        return x, y
+
+    def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1):
+        # normalize the observations and create  batch dimension
+        obs = self.normalize(obs, "observations")
+        obs = obs[None].repeat(batch_size, axis=0)
+
+        conditions = {0: self.to_torch(obs)}
+        shape = (batch_size, planning_horizon, self.state_dim + self.action_dim)
+
+        # generate initial noise and apply our conditions (to make the trajectories start at current state)
+        x1 = torch.randn(shape, device=self.unet.device)
+        x = self.reset_x0(x1, conditions, self.action_dim)
+        x = self.to_torch(x)
+
+        # run the diffusion process
+        x, y = self.run_diffusion(x, conditions, n_guide_steps, scale)
+
+        # sort output trajectories by value
+        sorted_idx = y.argsort(0, descending=True).squeeze()
+        sorted_values = x[sorted_idx]
+        actions = sorted_values[:, :, : self.action_dim]
+        actions = actions.detach().cpu().numpy()
+        denorm_actions = self.de_normalize(actions, key="actions")
+
+        # select the action with the highest value
+        if y is not None:
+            selected_index = 0
+        else:
+            # if we didn't run value guiding, select a random action
+            selected_index = np.random.randint(0, batch_size)
+        denorm_actions = denorm_actions[selected_index, 0]
+        return denorm_actions
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
index b09d43fc2edf..0221d891f171 100644
--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -62,14 +62,21 @@ def get_timestep_embedding(
 
 
 class TimestepEmbedding(nn.Module):
-    def __init__(self, channel: int, time_embed_dim: int, act_fn: str = "silu"):
+    def __init__(self, in_channels: int, time_embed_dim: int, act_fn: str = "silu", out_dim: int = None):
         super().__init__()
 
-        self.linear_1 = nn.Linear(channel, time_embed_dim)
+        self.linear_1 = nn.Linear(in_channels, time_embed_dim)
         self.act = None
         if act_fn == "silu":
             self.act = nn.SiLU()
-        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim)
+        elif act_fn == "mish":
+            self.act = nn.Mish()
+
+        if out_dim is not None:
+            time_embed_dim_out = out_dim
+        else:
+            time_embed_dim_out = time_embed_dim
+        self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)
 
     def forward(self, sample):
         sample = self.linear_1(sample)
diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py
index 7bb5416adf24..52d056ae96fb 100644
--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -5,6 +5,75 @@
 import torch.nn.functional as F
 
 
+class Upsample1D(nn.Module):
+    """
+    An upsampling layer with an optional convolution.
+
+    Parameters:
+            channels: channels in the inputs and outputs.
+            use_conv: a bool determining if a convolution is applied.
+            use_conv_transpose:
+            out_channels:
+    """
+
+    def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+
+        self.conv = None
+        if use_conv_transpose:
+            self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1)
+        elif use_conv:
+            self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1)
+
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        if self.use_conv_transpose:
+            return self.conv(x)
+
+        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
+
+        if self.use_conv:
+            x = self.conv(x)
+
+        return x
+
+
+class Downsample1D(nn.Module):
+    """
+    A downsampling layer with an optional convolution.
+
+    Parameters:
+        channels: channels in the inputs and outputs.
+        use_conv: a bool determining if a convolution is applied.
+        out_channels:
+        padding:
+    """
+
+    def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        stride = 2
+        self.name = name
+
+        if use_conv:
+            self.conv = nn.Conv1d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
+        else:
+            assert self.channels == self.out_channels
+            self.conv = nn.AvgPool1d(kernel_size=stride, stride=stride)
+
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        return self.conv(x)
+
+
 class Upsample2D(nn.Module):
     """
     An upsampling layer with an optional convolution.
@@ -12,7 +81,8 @@ class Upsample2D(nn.Module):
     Parameters:
         channels: channels in the inputs and outputs.
         use_conv: a bool determining if a convolution is applied.
-        dims: determines if the signal is 1D, 2D, or 3D. If 3D, then upsampling occurs in the inner-two dimensions.
+        use_conv_transpose:
+        out_channels:
     """
 
     def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
@@ -80,7 +150,8 @@ class Downsample2D(nn.Module):
     Parameters:
         channels: channels in the inputs and outputs.
         use_conv: a bool determining if a convolution is applied.
-        dims: determines if the signal is 1D, 2D, or 3D. If 3D, then downsampling occurs in the inner-two dimensions.
+        out_channels:
+        padding:
     """
 
     def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
@@ -415,6 +486,69 @@ def forward(self, hidden_states):
         return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states))
 
 
+# unet_rl.py
+def rearrange_dims(tensor):
+    if len(tensor.shape) == 2:
+        return tensor[:, :, None]
+    if len(tensor.shape) == 3:
+        return tensor[:, :, None, :]
+    elif len(tensor.shape) == 4:
+        return tensor[:, :, 0, :]
+    else:
+        raise ValueError(f"`len(tensor)`: {len(tensor)} has to be 2, 3 or 4.")
+
+
+class Conv1dBlock(nn.Module):
+    """
+    Conv1d --> GroupNorm --> Mish
+    """
+
+    def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8):
+        super().__init__()
+
+        self.conv1d = nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2)
+        self.group_norm = nn.GroupNorm(n_groups, out_channels)
+        self.mish = nn.Mish()
+
+    def forward(self, x):
+        x = self.conv1d(x)
+        x = rearrange_dims(x)
+        x = self.group_norm(x)
+        x = rearrange_dims(x)
+        x = self.mish(x)
+        return x
+
+
+# unet_rl.py
+class ResidualTemporalBlock1D(nn.Module):
+    def __init__(self, inp_channels, out_channels, embed_dim, kernel_size=5):
+        super().__init__()
+        self.conv_in = Conv1dBlock(inp_channels, out_channels, kernel_size)
+        self.conv_out = Conv1dBlock(out_channels, out_channels, kernel_size)
+
+        self.time_emb_act = nn.Mish()
+        self.time_emb = nn.Linear(embed_dim, out_channels)
+
+        self.residual_conv = (
+            nn.Conv1d(inp_channels, out_channels, 1) if inp_channels != out_channels else nn.Identity()
+        )
+
+    def forward(self, x, t):
+        """
+        Args:
+            x : [ batch_size x inp_channels x horizon ]
+            t : [ batch_size x embed_dim ]
+
+        returns:
+            out : [ batch_size x out_channels x horizon ]
+        """
+        t = self.time_emb_act(t)
+        t = self.time_emb(t)
+        out = self.conv_in(x) + rearrange_dims(t)
+        out = self.conv_out(out)
+        return out + self.residual_conv(x)
+
+
 def upsample_2d(hidden_states, kernel=None, factor=2, gain=1):
     r"""Upsample2D a batch of 2D images with the given filter.
     Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given
diff --git a/src/diffusers/models/unet_1d.py b/src/diffusers/models/unet_1d.py
index cc0685deb933..29d1d707f55a 100644
--- a/src/diffusers/models/unet_1d.py
+++ b/src/diffusers/models/unet_1d.py
@@ -1,3 +1,17 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 
@@ -8,7 +22,7 @@
 from ..modeling_utils import ModelMixin
 from ..utils import BaseOutput
 from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
-from .unet_1d_blocks import get_down_block, get_mid_block, get_up_block
+from .unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block
 
 
 @dataclass
@@ -30,11 +44,11 @@ class UNet1DModel(ModelMixin, ConfigMixin):
     implements for all the model (such as downloading or saving, etc.)
 
     Parameters:
-        sample_size (`int`, *optionl*): Default length of sample. Should be adaptable at runtime.
+        sample_size (`int`, *optional*): Default length of sample. Should be adaptable at runtime.
         in_channels (`int`, *optional*, defaults to 2): Number of channels in the input sample.
         out_channels (`int`, *optional*, defaults to 2): Number of channels in the output.
         time_embedding_type (`str`, *optional*, defaults to `"fourier"`): Type of time embedding to use.
-        freq_shift (`int`, *optional*, defaults to 0): Frequency shift for fourier time embedding.
+        freq_shift (`float`, *optional*, defaults to 0.0): Frequency shift for fourier time embedding.
         flip_sin_to_cos (`bool`, *optional*, defaults to :
             obj:`False`): Whether to flip sin to cos for fourier time embedding.
         down_block_types (`Tuple[str]`, *optional*, defaults to :
@@ -43,6 +57,13 @@ class UNet1DModel(ModelMixin, ConfigMixin):
             obj:`("UpBlock1D", "UpBlock1DNoSkip", "AttnUpBlock1D")`): Tuple of upsample block types.
         block_out_channels (`Tuple[int]`, *optional*, defaults to :
             obj:`(32, 32, 64)`): Tuple of block output channels.
+        mid_block_type (`str`, *optional*, defaults to "UNetMidBlock1D"): block type for middle of UNet.
+        out_block_type (`str`, *optional*, defaults to `None`): optional output processing of UNet.
+        act_fn (`str`, *optional*, defaults to None): optional activitation function in UNet blocks.
+        norm_num_groups (`int`, *optional*, defaults to 8): group norm member count in UNet blocks.
+        layers_per_block (`int`, *optional*, defaults to 1): added number of layers in a UNet block.
+        downsample_each_block (`int`, *optional*, defaults to False:
+            experimental feature for using a UNet without upsampling.
     """
 
     @register_to_config
@@ -54,16 +75,20 @@ def __init__(
         out_channels: int = 2,
         extra_in_channels: int = 0,
         time_embedding_type: str = "fourier",
-        freq_shift: int = 0,
         flip_sin_to_cos: bool = True,
         use_timestep_embedding: bool = False,
+        freq_shift: float = 0.0,
         down_block_types: Tuple[str] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
-        mid_block_type: str = "UNetMidBlock1D",
         up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
+        mid_block_type: Tuple[str] = "UNetMidBlock1D",
+        out_block_type: str = None,
         block_out_channels: Tuple[int] = (32, 32, 64),
+        act_fn: str = None,
+        norm_num_groups: int = 8,
+        layers_per_block: int = 1,
+        downsample_each_block: bool = False,
     ):
         super().__init__()
-
         self.sample_size = sample_size
 
         # time
@@ -73,12 +98,19 @@ def __init__(
             )
             timestep_input_dim = 2 * block_out_channels[0]
         elif time_embedding_type == "positional":
-            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            self.time_proj = Timesteps(
+                block_out_channels[0], flip_sin_to_cos=flip_sin_to_cos, downscale_freq_shift=freq_shift
+            )
             timestep_input_dim = block_out_channels[0]
 
         if use_timestep_embedding:
             time_embed_dim = block_out_channels[0] * 4
-            self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+            self.time_mlp = TimestepEmbedding(
+                in_channels=timestep_input_dim,
+                time_embed_dim=time_embed_dim,
+                act_fn=act_fn,
+                out_dim=block_out_channels[0],
+            )
 
         self.down_blocks = nn.ModuleList([])
         self.mid_block = None
@@ -94,38 +126,66 @@ def __init__(
             if i == 0:
                 input_channel += extra_in_channels
 
+            is_final_block = i == len(block_out_channels) - 1
+
             down_block = get_down_block(
                 down_block_type,
+                num_layers=layers_per_block,
                 in_channels=input_channel,
                 out_channels=output_channel,
+                temb_channels=block_out_channels[0],
+                add_downsample=not is_final_block or downsample_each_block,
             )
             self.down_blocks.append(down_block)
 
         # mid
         self.mid_block = get_mid_block(
-            mid_block_type=mid_block_type,
-            mid_channels=block_out_channels[-1],
+            mid_block_type,
             in_channels=block_out_channels[-1],
-            out_channels=None,
+            mid_channels=block_out_channels[-1],
+            out_channels=block_out_channels[-1],
+            embed_dim=block_out_channels[0],
+            num_layers=layers_per_block,
+            add_downsample=downsample_each_block,
         )
 
         # up
         reversed_block_out_channels = list(reversed(block_out_channels))
         output_channel = reversed_block_out_channels[0]
+        if out_block_type is None:
+            final_upsample_channels = out_channels
+        else:
+            final_upsample_channels = block_out_channels[0]
+
         for i, up_block_type in enumerate(up_block_types):
             prev_output_channel = output_channel
-            output_channel = reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else out_channels
+            output_channel = (
+                reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else final_upsample_channels
+            )
+
+            is_final_block = i == len(block_out_channels) - 1
 
             up_block = get_up_block(
                 up_block_type,
+                num_layers=layers_per_block,
                 in_channels=prev_output_channel,
                 out_channels=output_channel,
+                temb_channels=block_out_channels[0],
+                add_upsample=not is_final_block,
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
 
-        # TODO(PVP, Nathan) placeholder for RL application to be merged shortly
-        # Totally fine to add another layer with a if statement - no need for nn.Identity here
+        # out
+        num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
+        self.out_block = get_out_block(
+            out_block_type=out_block_type,
+            num_groups_out=num_groups_out,
+            embed_dim=block_out_channels[0],
+            out_channels=out_channels,
+            act_fn=act_fn,
+            fc_dim=block_out_channels[-1] // 4,
+        )
 
     def forward(
         self,
@@ -144,12 +204,20 @@ def forward(
             [`~models.unet_1d.UNet1DOutput`] or `tuple`: [`~models.unet_1d.UNet1DOutput`] if `return_dict` is True,
             otherwise a `tuple`. When returning a tuple, the first element is the sample tensor.
         """
-        # 1. time
-        if len(timestep.shape) == 0:
-            timestep = timestep[None]
 
-        timestep_embed = self.time_proj(timestep)[..., None]
-        timestep_embed = timestep_embed.repeat([1, 1, sample.shape[2]]).to(sample.dtype)
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
+        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        timestep_embed = self.time_proj(timesteps)
+        if self.config.use_timestep_embedding:
+            timestep_embed = self.time_mlp(timestep_embed)
+        else:
+            timestep_embed = timestep_embed[..., None]
+            timestep_embed = timestep_embed.repeat([1, 1, sample.shape[2]]).to(sample.dtype)
 
         # 2. down
         down_block_res_samples = ()
@@ -158,13 +226,18 @@ def forward(
             down_block_res_samples += res_samples
 
         # 3. mid
-        sample = self.mid_block(sample)
+        if self.mid_block:
+            sample = self.mid_block(sample, timestep_embed)
 
         # 4. up
         for i, upsample_block in enumerate(self.up_blocks):
             res_samples = down_block_res_samples[-1:]
             down_block_res_samples = down_block_res_samples[:-1]
-            sample = upsample_block(sample, res_samples)
+            sample = upsample_block(sample, res_hidden_states_tuple=res_samples, temb=timestep_embed)
+
+        # 5. post-process
+        if self.out_block:
+            sample = self.out_block(sample, timestep_embed)
 
         if not return_dict:
             return (sample,)
diff --git a/src/diffusers/models/unet_1d_blocks.py b/src/diffusers/models/unet_1d_blocks.py
index 9009071d1e78..fc758ebbb044 100644
--- a/src/diffusers/models/unet_1d_blocks.py
+++ b/src/diffusers/models/unet_1d_blocks.py
@@ -17,6 +17,256 @@
 import torch.nn.functional as F
 from torch import nn
 
+from .resnet import Downsample1D, ResidualTemporalBlock1D, Upsample1D, rearrange_dims
+
+
+class DownResnetBlock1D(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels=None,
+        num_layers=1,
+        conv_shortcut=False,
+        temb_channels=32,
+        groups=32,
+        groups_out=None,
+        non_linearity=None,
+        time_embedding_norm="default",
+        output_scale_factor=1.0,
+        add_downsample=True,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.time_embedding_norm = time_embedding_norm
+        self.add_downsample = add_downsample
+        self.output_scale_factor = output_scale_factor
+
+        if groups_out is None:
+            groups_out = groups
+
+        # there will always be at least one resnet
+        resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=temb_channels)]
+
+        for _ in range(num_layers):
+            resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels))
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if non_linearity == "swish":
+            self.nonlinearity = lambda x: F.silu(x)
+        elif non_linearity == "mish":
+            self.nonlinearity = nn.Mish()
+        elif non_linearity == "silu":
+            self.nonlinearity = nn.SiLU()
+        else:
+            self.nonlinearity = None
+
+        self.downsample = None
+        if add_downsample:
+            self.downsample = Downsample1D(out_channels, use_conv=True, padding=1)
+
+    def forward(self, hidden_states, temb=None):
+        output_states = ()
+
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for resnet in self.resnets[1:]:
+            hidden_states = resnet(hidden_states, temb)
+
+        output_states += (hidden_states,)
+
+        if self.nonlinearity is not None:
+            hidden_states = self.nonlinearity(hidden_states)
+
+        if self.downsample is not None:
+            hidden_states = self.downsample(hidden_states)
+
+        return hidden_states, output_states
+
+
+class UpResnetBlock1D(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels=None,
+        num_layers=1,
+        temb_channels=32,
+        groups=32,
+        groups_out=None,
+        non_linearity=None,
+        time_embedding_norm="default",
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.time_embedding_norm = time_embedding_norm
+        self.add_upsample = add_upsample
+        self.output_scale_factor = output_scale_factor
+
+        if groups_out is None:
+            groups_out = groups
+
+        # there will always be at least one resnet
+        resnets = [ResidualTemporalBlock1D(2 * in_channels, out_channels, embed_dim=temb_channels)]
+
+        for _ in range(num_layers):
+            resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels))
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if non_linearity == "swish":
+            self.nonlinearity = lambda x: F.silu(x)
+        elif non_linearity == "mish":
+            self.nonlinearity = nn.Mish()
+        elif non_linearity == "silu":
+            self.nonlinearity = nn.SiLU()
+        else:
+            self.nonlinearity = None
+
+        self.upsample = None
+        if add_upsample:
+            self.upsample = Upsample1D(out_channels, use_conv_transpose=True)
+
+    def forward(self, hidden_states, res_hidden_states_tuple=None, temb=None):
+        if res_hidden_states_tuple is not None:
+            res_hidden_states = res_hidden_states_tuple[-1]
+            hidden_states = torch.cat((hidden_states, res_hidden_states), dim=1)
+
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for resnet in self.resnets[1:]:
+            hidden_states = resnet(hidden_states, temb)
+
+        if self.nonlinearity is not None:
+            hidden_states = self.nonlinearity(hidden_states)
+
+        if self.upsample is not None:
+            hidden_states = self.upsample(hidden_states)
+
+        return hidden_states
+
+
+class ValueFunctionMidBlock1D(nn.Module):
+    def __init__(self, in_channels, out_channels, embed_dim):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.embed_dim = embed_dim
+
+        self.res1 = ResidualTemporalBlock1D(in_channels, in_channels // 2, embed_dim=embed_dim)
+        self.down1 = Downsample1D(out_channels // 2, use_conv=True)
+        self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim)
+        self.down2 = Downsample1D(out_channels // 4, use_conv=True)
+
+    def forward(self, x, temb=None):
+        x = self.res1(x, temb)
+        x = self.down1(x)
+        x = self.res2(x, temb)
+        x = self.down2(x)
+        return x
+
+
+class MidResTemporalBlock1D(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        embed_dim,
+        num_layers: int = 1,
+        add_downsample: bool = False,
+        add_upsample: bool = False,
+        non_linearity=None,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.add_downsample = add_downsample
+
+        # there will always be at least one resnet
+        resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=embed_dim)]
+
+        for _ in range(num_layers):
+            resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=embed_dim))
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if non_linearity == "swish":
+            self.nonlinearity = lambda x: F.silu(x)
+        elif non_linearity == "mish":
+            self.nonlinearity = nn.Mish()
+        elif non_linearity == "silu":
+            self.nonlinearity = nn.SiLU()
+        else:
+            self.nonlinearity = None
+
+        self.upsample = None
+        if add_upsample:
+            self.upsample = Downsample1D(out_channels, use_conv=True)
+
+        self.downsample = None
+        if add_downsample:
+            self.downsample = Downsample1D(out_channels, use_conv=True)
+
+        if self.upsample and self.downsample:
+            raise ValueError("Block cannot downsample and upsample")
+
+    def forward(self, hidden_states, temb):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for resnet in self.resnets[1:]:
+            hidden_states = resnet(hidden_states, temb)
+
+        if self.upsample:
+            hidden_states = self.upsample(hidden_states)
+        if self.downsample:
+            self.downsample = self.downsample(hidden_states)
+
+        return hidden_states
+
+
+class OutConv1DBlock(nn.Module):
+    def __init__(self, num_groups_out, out_channels, embed_dim, act_fn):
+        super().__init__()
+        self.final_conv1d_1 = nn.Conv1d(embed_dim, embed_dim, 5, padding=2)
+        self.final_conv1d_gn = nn.GroupNorm(num_groups_out, embed_dim)
+        if act_fn == "silu":
+            self.final_conv1d_act = nn.SiLU()
+        if act_fn == "mish":
+            self.final_conv1d_act = nn.Mish()
+        self.final_conv1d_2 = nn.Conv1d(embed_dim, out_channels, 1)
+
+    def forward(self, hidden_states, temb=None):
+        hidden_states = self.final_conv1d_1(hidden_states)
+        hidden_states = rearrange_dims(hidden_states)
+        hidden_states = self.final_conv1d_gn(hidden_states)
+        hidden_states = rearrange_dims(hidden_states)
+        hidden_states = self.final_conv1d_act(hidden_states)
+        hidden_states = self.final_conv1d_2(hidden_states)
+        return hidden_states
+
+
+class OutValueFunctionBlock(nn.Module):
+    def __init__(self, fc_dim, embed_dim):
+        super().__init__()
+        self.final_block = nn.ModuleList(
+            [
+                nn.Linear(fc_dim + embed_dim, fc_dim // 2),
+                nn.Mish(),
+                nn.Linear(fc_dim // 2, 1),
+            ]
+        )
+
+    def forward(self, hidden_states, temb):
+        hidden_states = hidden_states.view(hidden_states.shape[0], -1)
+        hidden_states = torch.cat((hidden_states, temb), dim=-1)
+        for layer in self.final_block:
+            hidden_states = layer(hidden_states)
+
+        return hidden_states
+
 
 _kernels = {
     "linear": [1 / 8, 3 / 8, 3 / 8, 1 / 8],
@@ -62,7 +312,7 @@ def __init__(self, kernel="linear", pad_mode="reflect"):
         self.pad = kernel_1d.shape[0] // 2 - 1
         self.register_buffer("kernel", kernel_1d)
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states, temb=None):
         hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode)
         weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]])
         indices = torch.arange(hidden_states.shape[1], device=hidden_states.device)
@@ -162,32 +412,6 @@ def forward(self, hidden_states):
         return output
 
 
-def get_down_block(down_block_type, out_channels, in_channels):
-    if down_block_type == "DownBlock1D":
-        return DownBlock1D(out_channels=out_channels, in_channels=in_channels)
-    elif down_block_type == "AttnDownBlock1D":
-        return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels)
-    elif down_block_type == "DownBlock1DNoSkip":
-        return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels)
-    raise ValueError(f"{down_block_type} does not exist.")
-
-
-def get_up_block(up_block_type, in_channels, out_channels):
-    if up_block_type == "UpBlock1D":
-        return UpBlock1D(in_channels=in_channels, out_channels=out_channels)
-    elif up_block_type == "AttnUpBlock1D":
-        return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels)
-    elif up_block_type == "UpBlock1DNoSkip":
-        return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels)
-    raise ValueError(f"{up_block_type} does not exist.")
-
-
-def get_mid_block(mid_block_type, in_channels, mid_channels, out_channels):
-    if mid_block_type == "UNetMidBlock1D":
-        return UNetMidBlock1D(in_channels=in_channels, mid_channels=mid_channels, out_channels=out_channels)
-    raise ValueError(f"{mid_block_type} does not exist.")
-
-
 class UNetMidBlock1D(nn.Module):
     def __init__(self, mid_channels, in_channels, out_channels=None):
         super().__init__()
@@ -217,7 +441,7 @@ def __init__(self, mid_channels, in_channels, out_channels=None):
         self.attentions = nn.ModuleList(attentions)
         self.resnets = nn.ModuleList(resnets)
 
-    def forward(self, hidden_states):
+    def forward(self, hidden_states, temb=None):
         hidden_states = self.down(hidden_states)
         for attn, resnet in zip(self.attentions, self.resnets):
             hidden_states = resnet(hidden_states)
@@ -322,7 +546,7 @@ def __init__(self, in_channels, out_channels, mid_channels=None):
         self.resnets = nn.ModuleList(resnets)
         self.up = Upsample1d(kernel="cubic")
 
-    def forward(self, hidden_states, res_hidden_states_tuple):
+    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
         res_hidden_states = res_hidden_states_tuple[-1]
         hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
 
@@ -349,7 +573,7 @@ def __init__(self, in_channels, out_channels, mid_channels=None):
         self.resnets = nn.ModuleList(resnets)
         self.up = Upsample1d(kernel="cubic")
 
-    def forward(self, hidden_states, res_hidden_states_tuple):
+    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
         res_hidden_states = res_hidden_states_tuple[-1]
         hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
 
@@ -374,7 +598,7 @@ def __init__(self, in_channels, out_channels, mid_channels=None):
 
         self.resnets = nn.ModuleList(resnets)
 
-    def forward(self, hidden_states, res_hidden_states_tuple):
+    def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
         res_hidden_states = res_hidden_states_tuple[-1]
         hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
 
@@ -382,3 +606,63 @@ def forward(self, hidden_states, res_hidden_states_tuple):
             hidden_states = resnet(hidden_states)
 
         return hidden_states
+
+
+def get_down_block(down_block_type, num_layers, in_channels, out_channels, temb_channels, add_downsample):
+    if down_block_type == "DownResnetBlock1D":
+        return DownResnetBlock1D(
+            in_channels=in_channels,
+            num_layers=num_layers,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+        )
+    elif down_block_type == "DownBlock1D":
+        return DownBlock1D(out_channels=out_channels, in_channels=in_channels)
+    elif down_block_type == "AttnDownBlock1D":
+        return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels)
+    elif down_block_type == "DownBlock1DNoSkip":
+        return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels)
+    raise ValueError(f"{down_block_type} does not exist.")
+
+
+def get_up_block(up_block_type, num_layers, in_channels, out_channels, temb_channels, add_upsample):
+    if up_block_type == "UpResnetBlock1D":
+        return UpResnetBlock1D(
+            in_channels=in_channels,
+            num_layers=num_layers,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+        )
+    elif up_block_type == "UpBlock1D":
+        return UpBlock1D(in_channels=in_channels, out_channels=out_channels)
+    elif up_block_type == "AttnUpBlock1D":
+        return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels)
+    elif up_block_type == "UpBlock1DNoSkip":
+        return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels)
+    raise ValueError(f"{up_block_type} does not exist.")
+
+
+def get_mid_block(mid_block_type, num_layers, in_channels, mid_channels, out_channels, embed_dim, add_downsample):
+    if mid_block_type == "MidResTemporalBlock1D":
+        return MidResTemporalBlock1D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            embed_dim=embed_dim,
+            add_downsample=add_downsample,
+        )
+    elif mid_block_type == "ValueFunctionMidBlock1D":
+        return ValueFunctionMidBlock1D(in_channels=in_channels, out_channels=out_channels, embed_dim=embed_dim)
+    elif mid_block_type == "UNetMidBlock1D":
+        return UNetMidBlock1D(in_channels=in_channels, mid_channels=mid_channels, out_channels=out_channels)
+    raise ValueError(f"{mid_block_type} does not exist.")
+
+
+def get_out_block(*, out_block_type, num_groups_out, embed_dim, out_channels, act_fn, fc_dim):
+    if out_block_type == "OutConv1DBlock":
+        return OutConv1DBlock(num_groups_out, out_channels, embed_dim, act_fn)
+    elif out_block_type == "ValueFunction":
+        return OutValueFunctionBlock(fc_dim, embed_dim)
+    return None
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index a19d91879cc3..c3e373d2bdca 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -204,6 +204,7 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None):
         # for rl-diffuser https://arxiv.org/abs/2205.09991
         elif variance_type == "fixed_small_log":
             variance = torch.log(torch.clamp(variance, min=1e-20))
+            variance = torch.exp(0.5 * variance)
         elif variance_type == "fixed_large":
             variance = self.betas[t]
         elif variance_type == "fixed_large_log":
@@ -301,7 +302,10 @@ def step(
                 variance_noise = torch.randn(
                     model_output.shape, generator=generator, device=device, dtype=model_output.dtype
                 )
-            variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise
+            if self.variance_type == "fixed_small_log":
+                variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise
+            else:
+                variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise
 
         pred_prev_sample = pred_prev_sample + variance
 
diff --git a/tests/models/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py
index c274ce419211..41c4fdecfa0a 100644
--- a/tests/models/test_models_unet_1d.py
+++ b/tests/models/test_models_unet_1d.py
@@ -18,13 +18,120 @@
 import torch
 
 from diffusers import UNet1DModel
-from diffusers.utils import slow, torch_device
+from diffusers.utils import floats_tensor, slow, torch_device
+
+from ..test_modeling_common import ModelTesterMixin
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
-class UnetModel1DTests(unittest.TestCase):
+class UNet1DModelTests(ModelTesterMixin, unittest.TestCase):
+    model_class = UNet1DModel
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_features = 14
+        seq_len = 16
+
+        noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device)
+        time_step = torch.tensor([10] * batch_size).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step}
+
+    @property
+    def input_shape(self):
+        return (4, 14, 16)
+
+    @property
+    def output_shape(self):
+        return (4, 14, 16)
+
+    def test_ema_training(self):
+        pass
+
+    def test_training(self):
+        pass
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_determinism(self):
+        super().test_determinism()
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_outputs_equivalence(self):
+        super().test_outputs_equivalence()
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_from_pretrained_save_pretrained(self):
+        super().test_from_pretrained_save_pretrained()
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_model_from_config(self):
+        super().test_model_from_config()
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_output(self):
+        super().test_output()
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (32, 64, 128, 256),
+            "in_channels": 14,
+            "out_channels": 14,
+            "time_embedding_type": "positional",
+            "use_timestep_embedding": True,
+            "flip_sin_to_cos": False,
+            "freq_shift": 1.0,
+            "out_block_type": "OutConv1DBlock",
+            "mid_block_type": "MidResTemporalBlock1D",
+            "down_block_types": ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"),
+            "up_block_types": ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"),
+            "act_fn": "mish",
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_from_pretrained_hub(self):
+        model, loading_info = UNet1DModel.from_pretrained(
+            "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="unet"
+        )
+        self.assertIsNotNone(model)
+        self.assertEqual(len(loading_info["missing_keys"]), 0)
+
+        model.to(torch_device)
+        image = model(**self.dummy_input)
+
+        assert image is not None, "Make sure output is not None"
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_output_pretrained(self):
+        model = UNet1DModel.from_pretrained("bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet")
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+        num_features = model.in_channels
+        seq_len = 16
+        noise = torch.randn((1, seq_len, num_features)).permute(
+            0, 2, 1
+        )  # match original, we can update values and remove
+        time_step = torch.full((num_features,), 0)
+
+        with torch.no_grad():
+            output = model(noise, time_step).sample.permute(0, 2, 1)
+
+        output_slice = output[0, -3:, -3:].flatten()
+        # fmt: off
+        expected_output_slice = torch.tensor([-2.137172, 1.1426016, 0.3688687, -0.766922, 0.7303146, 0.11038864, -0.4760633, 0.13270172, 0.02591348])
+        # fmt: on
+        self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-3))
+
+    def test_forward_with_norm_groups(self):
+        # Not implemented yet for this UNet
+        pass
+
     @slow
     def test_unet_1d_maestro(self):
         model_id = "harmonai/maestro-150k"
@@ -43,3 +150,127 @@ def test_unet_1d_maestro(self):
 
         assert (output_sum - 224.0896).abs() < 4e-2
         assert (output_max - 0.0607).abs() < 4e-4
+
+
+class UNetRLModelTests(ModelTesterMixin, unittest.TestCase):
+    model_class = UNet1DModel
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_features = 14
+        seq_len = 16
+
+        noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device)
+        time_step = torch.tensor([10] * batch_size).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step}
+
+    @property
+    def input_shape(self):
+        return (4, 14, 16)
+
+    @property
+    def output_shape(self):
+        return (4, 14, 1)
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_determinism(self):
+        super().test_determinism()
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_outputs_equivalence(self):
+        super().test_outputs_equivalence()
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_from_pretrained_save_pretrained(self):
+        super().test_from_pretrained_save_pretrained()
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_model_from_config(self):
+        super().test_model_from_config()
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_output(self):
+        # UNetRL is a value-function is different output shape
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = torch.Size((inputs_dict["sample"].shape[0], 1))
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_ema_training(self):
+        pass
+
+    def test_training(self):
+        pass
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "in_channels": 14,
+            "out_channels": 14,
+            "down_block_types": ["DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"],
+            "up_block_types": [],
+            "out_block_type": "ValueFunction",
+            "mid_block_type": "ValueFunctionMidBlock1D",
+            "block_out_channels": [32, 64, 128, 256],
+            "layers_per_block": 1,
+            "downsample_each_block": True,
+            "use_timestep_embedding": True,
+            "freq_shift": 1.0,
+            "flip_sin_to_cos": False,
+            "time_embedding_type": "positional",
+            "act_fn": "mish",
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_from_pretrained_hub(self):
+        value_function, vf_loading_info = UNet1DModel.from_pretrained(
+            "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function"
+        )
+        self.assertIsNotNone(value_function)
+        self.assertEqual(len(vf_loading_info["missing_keys"]), 0)
+
+        value_function.to(torch_device)
+        image = value_function(**self.dummy_input)
+
+        assert image is not None, "Make sure output is not None"
+
+    @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
+    def test_output_pretrained(self):
+        value_function, vf_loading_info = UNet1DModel.from_pretrained(
+            "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function"
+        )
+        torch.manual_seed(0)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(0)
+
+        num_features = value_function.in_channels
+        seq_len = 14
+        noise = torch.randn((1, seq_len, num_features)).permute(
+            0, 2, 1
+        )  # match original, we can update values and remove
+        time_step = torch.full((num_features,), 0)
+
+        with torch.no_grad():
+            output = value_function(noise, time_step).sample
+
+        # fmt: off
+        expected_output_slice = torch.tensor([165.25] * seq_len)
+        # fmt: on
+        self.assertTrue(torch.allclose(output, expected_output_slice, rtol=1e-3))
+
+    def test_forward_with_norm_groups(self):
+        # Not implemented yet for this UNet
+        pass
diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
index 72e67e4479d2..a63ef84c63f5 100644
--- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py
+++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py
@@ -44,6 +44,10 @@ def dummy_unet(self):
             sample_rate=16_000,
             in_channels=2,
             out_channels=2,
+            flip_sin_to_cos=True,
+            use_timestep_embedding=False,
+            time_embedding_type="fourier",
+            mid_block_type="UNetMidBlock1D",
             down_block_types=["DownBlock1DNoSkip"] + ["DownBlock1D"] + ["AttnDownBlock1D"],
             up_block_types=["AttnUpBlock1D"] + ["UpBlock1D"] + ["UpBlock1DNoSkip"],
         )

From 57525bb41879d42fa8a0379b0c800011e5802277 Mon Sep 17 00:00:00 2001
From: Joshua Lochner <admin@xenova.com>
Date: Mon, 14 Nov 2022 23:54:09 +0200
Subject: [PATCH 186/269] Fix documentation typo for `UNet2DModel` and
 `UNet2DConditionModel` (#1275)

* Fix documentation typo

* Fix other typo
---
 src/diffusers/models/unet_2d.py           | 2 +-
 src/diffusers/models/unet_2d_condition.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py
index 641c253c86f8..04324057609e 100644
--- a/src/diffusers/models/unet_2d.py
+++ b/src/diffusers/models/unet_2d.py
@@ -51,7 +51,7 @@ class UNet2DModel(ModelMixin, ConfigMixin):
         time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use.
         freq_shift (`int`, *optional*, defaults to 0): Frequency shift for fourier time embedding.
         flip_sin_to_cos (`bool`, *optional*, defaults to :
-            obj:`False`): Whether to flip sin to cos for fourier time embedding.
+            obj:`True`): Whether to flip sin to cos for fourier time embedding.
         down_block_types (`Tuple[str]`, *optional*, defaults to :
             obj:`("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`): Tuple of downsample block
             types.
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 7f7f3ecd4435..becae7568343 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -60,7 +60,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin):
         in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
         out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
         center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
-        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
             Whether to flip the sin to cos in the time embedding.
         freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
         down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):

From 07f9e56d51e994d14e1e0d3330f1ee193df44f0d Mon Sep 17 00:00:00 2001
From: Nan Liu <33531451+nanliu1@users.noreply.github.com>
Date: Tue, 15 Nov 2022 03:19:06 -0600
Subject: [PATCH 187/269] add source link to composable diffusion model (#1293)

---
 examples/community/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/community/README.md b/examples/community/README.md
index fd6fff79c576..5535937dca7d 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -15,7 +15,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Long Prompt Weighting Stable Diffusion | **One** Stable Diffusion Pipeline without tokens length limit, and support parsing weighting in prompt.                                                                                                                                                                                                                                                                                                                                                                                                  | [Long Prompt Weighting Stable Diffusion](#long-prompt-weighting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [SkyTNT](https://github.com/SkyTNT) |
 | Speech to Image                        | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images                                                                                                                                                                                                                                                                                                                                                                                                            | [Speech to Image](#speech-to-image)                               | -                                                                                                                                                                                                                  | [Mikail Duzenli](https://github.com/MikailINTech)
 | Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values                                                                                                                                                                                                                                                                                                     | [Wildcard Stable Diffusion](#wildcard-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Shyam Sudhakaran](https://github.com/shyamsn97) |
-| Composable Stable Diffusion| Stable Diffusion Pipeline that supports prompts that contain "&#124;" in prompts (as an AND condition) and weights (separated by "&#124;" as well) to positively / negatively weight prompts.                                                                                                                                                                                                                                                                                                     | [Composable Stable Diffusion](#composable-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
+| [Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) | Stable Diffusion Pipeline that supports prompts that contain "&#124;" in prompts (as an AND condition) and weights (separated by "&#124;" as well) to positively / negatively weight prompts.                                                                                                                                                                                                                                                                                                     | [Composable Stable Diffusion](#composable-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 | Seed Resizing Stable Diffusion| Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation.                                                                                                                                                                                                                                                                                                     | [Seed Resizing](#seed-resizing)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 | Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image| [Imagic Stable Diffusion](#imagic-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 | Multilingual Stable Diffusion| Stable Diffusion Pipeline that supports prompts in 50 different languages.                                                                                                                                                                                                                                                                                                     | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline)                                                                 | -                                                                                                                                                                                                                  |                        [Juan Carlos Piñeros](https://github.com/juancopi81) |
@@ -345,6 +345,8 @@ out = pipe(
 
 ### Composable Stable diffusion 
 
+[Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) proposes conjunction and negation (negative prompts) operators for compositional generation with conditional diffusion models.
+
 ```python
 import torch as th
 import numpy as np

From 610e2a6fd98126b5a2fc9bf223eae2cd7de5b032 Mon Sep 17 00:00:00 2001
From: Dhruv Naik <dhruvrnaik@gmail.com>
Date: Tue, 15 Nov 2022 04:19:35 -0500
Subject: [PATCH 188/269] Fix incorrect link to Stable Diffusion notebook
 (#1291)

Update README.md
---
 src/diffusers/pipelines/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/README.md b/src/diffusers/pipelines/README.md
index 2941660fa235..6ff40d3549b4 100644
--- a/src/diffusers/pipelines/README.md
+++ b/src/diffusers/pipelines/README.md
@@ -40,7 +40,7 @@ available a colab notebook to directly try them out.
 | [pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pndm)                                       | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778)                           | *Unconditional Image Generation* | 
 | [score_sde_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_ve)                       | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | 
 | [score_sde_vp](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_vp)                       | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | 
-| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb)
 | [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
 | [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion)               | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release)                                            | *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
 | [stochastic_karras_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stochastic_karras_ve)       | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364)                    | *Unconditional Image Generation* | 

From db1cb0b1a233cc6f4029261a67e503b322f31cd0 Mon Sep 17 00:00:00 2001
From: Glenn 'devalias' Grant <glenn@devalias.net>
Date: Tue, 15 Nov 2022 22:53:54 +1100
Subject: [PATCH 189/269] [dreambooth] link to bitsandbytes readme for
 installation  (#1229)

* add 'conda install cudatoolkit' to dreambooth 'training on 16GB' example

fixes https://github.com/huggingface/diffusers/issues/1207

* Apply suggestions from code review

Co-authored-by: Suraj Patil <surajp815@gmail.com>
---
 examples/dreambooth/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index 3c9d04abc2d2..2339e2979dbd 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -92,7 +92,7 @@ accelerate launch train_dreambooth.py \
 
 With the help of gradient checkpointing and the 8-bit optimizer from bitsandbytes it's possible to run train dreambooth on a 16GB GPU.
 
-Install `bitsandbytes` with `pip install bitsandbytes`
+To install `bitandbytes` please refer to this [readme](https://github.com/TimDettmers/bitsandbytes#requirements--installation).
 
 ```bash
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"

From a0520193e15951655ee2c08c24bfdca716f6f64c Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 15 Nov 2022 18:15:13 +0100
Subject: [PATCH 190/269] Add Scheduler.from_pretrained and better scheduler
 changing (#1286)

* add conversion script for vae

* uP

* uP

* more changes

* push

* up

* finish again

* up

* up

* up

* up

* finish

* up

* uP

* up

* Apply suggestions from code review

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
Co-authored-by: Anton Lozhkov <anton@huggingface.co>
Co-authored-by: Suraj Patil <surajp815@gmail.com>

* up

* up

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
Co-authored-by: Anton Lozhkov <anton@huggingface.co>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
---
 README.md                                     |  10 +-
 docs/source/_toctree.yml                      |   2 +
 docs/source/api/configuration.mdx             |   4 +-
 docs/source/api/pipelines/cycle_diffusion.mdx |   2 +-
 docs/source/api/pipelines/repaint.mdx         |   2 +-
 .../source/api/pipelines/stable_diffusion.mdx |  12 +-
 docs/source/quicktour.mdx                     |  25 +-
 docs/source/using-diffusers/loading.mdx       |  26 +-
 docs/source/using-diffusers/schedulers.mdx    | 262 ++++++++++++++++++
 src/diffusers/configuration_utils.py          | 222 ++++++++++-----
 src/diffusers/pipeline_flax_utils.py          |  10 +-
 src/diffusers/pipeline_utils.py               |  18 +-
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py |   2 +-
 .../pipelines/stable_diffusion/README.md      |   6 +-
 src/diffusers/schedulers/scheduling_ddim.py   |  15 +-
 .../schedulers/scheduling_ddim_flax.py        |  13 +-
 src/diffusers/schedulers/scheduling_ddpm.py   |  17 +-
 .../schedulers/scheduling_ddpm_flax.py        |  15 +-
 .../scheduling_dpmsolver_multistep.py         |  14 +-
 .../scheduling_dpmsolver_multistep_flax.py    |  13 +-
 .../scheduling_euler_ancestral_discrete.py    |  15 +-
 .../schedulers/scheduling_euler_discrete.py   |  15 +-
 src/diffusers/schedulers/scheduling_ipndm.py  |   4 +-
 .../schedulers/scheduling_karras_ve.py        |   4 +-
 .../schedulers/scheduling_karras_ve_flax.py   |   4 +-
 .../schedulers/scheduling_lms_discrete.py     |  15 +-
 .../scheduling_lms_discrete_flax.py           |  13 +-
 src/diffusers/schedulers/scheduling_pndm.py   |  14 +-
 .../schedulers/scheduling_pndm_flax.py        |  13 +-
 .../schedulers/scheduling_repaint.py          |   4 +-
 src/diffusers/schedulers/scheduling_sde_ve.py |   4 +-
 .../schedulers/scheduling_sde_ve_flax.py      |   4 +-
 src/diffusers/schedulers/scheduling_sde_vp.py |   4 +-
 src/diffusers/schedulers/scheduling_utils.py  | 111 ++++++++
 .../schedulers/scheduling_utils_flax.py       | 121 +++++++-
 .../schedulers/scheduling_vq_diffusion.py     |   4 +-
 src/diffusers/utils/__init__.py               |  10 +
 tests/models/test_models_unet_1d.py           |   8 +-
 tests/pipelines/ddim/test_ddim.py             |   2 +-
 tests/pipelines/ddpm/test_ddpm.py             |   2 +-
 tests/pipelines/repaint/test_repaint.py       |   2 +-
 .../score_sde_ve/test_score_sde_ve.py         |   2 +-
 .../stable_diffusion/test_cycle_diffusion.py  |   4 +-
 .../test_onnx_stable_diffusion.py             |   4 +-
 .../test_onnx_stable_diffusion_img2img.py     |   2 +-
 .../test_onnx_stable_diffusion_inpaint.py     |   2 +-
 .../stable_diffusion/test_stable_diffusion.py |   4 +-
 .../test_stable_diffusion_img2img.py          |   6 +-
 .../test_stable_diffusion_inpaint.py          |   4 +-
 .../test_stable_diffusion_inpaint_legacy.py   |   2 +-
 tests/test_config.py                          | 131 +--------
 tests/test_modeling_common.py                 |   6 +-
 tests/test_pipelines.py                       |  82 +++++-
 tests/test_scheduler.py                       | 228 ++++++++++++++-
 tests/test_scheduler_flax.py                  |  16 +-
 55 files changed, 1149 insertions(+), 407 deletions(-)
 create mode 100644 docs/source/using-diffusers/schedulers.mdx

diff --git a/README.md b/README.md
index 64cbd15aab26..4a944d04590e 100644
--- a/README.md
+++ b/README.md
@@ -152,15 +152,7 @@ it before the pipeline and pass it to `from_pretrained`.
 ```python
 from diffusers import LMSDiscreteScheduler
 
-lms = LMSDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
-
-pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", 
-    revision="fp16", 
-    torch_dtype=torch.float16,
-    scheduler=lms,
-)
-pipe = pipe.to("cuda")
+pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
 
 prompt = "a photo of an astronaut riding a horse on mars"
 image = pipe(prompt).images[0]  
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index d8efb5eee3d3..0e8dec816774 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -10,6 +10,8 @@
   - sections:
     - local: using-diffusers/loading
       title: "Loading Pipelines, Models, and Schedulers"
+    - local: using-diffusers/schedulers
+      title: "Using different Schedulers"
     - local: using-diffusers/configuration
       title: "Configuring Pipelines, Models, and Schedulers"
     - local: using-diffusers/custom_pipeline_overview
diff --git a/docs/source/api/configuration.mdx b/docs/source/api/configuration.mdx
index 45176f55b018..423c31f462b6 100644
--- a/docs/source/api/configuration.mdx
+++ b/docs/source/api/configuration.mdx
@@ -15,9 +15,9 @@ specific language governing permissions and limitations under the License.
 In Diffusers, schedulers of type [`schedulers.scheduling_utils.SchedulerMixin`], and models of type [`ModelMixin`] inherit from [`ConfigMixin`] which conveniently takes care of storing all parameters that are 
 passed to the respective `__init__` methods in a JSON-configuration file.
 
-TODO(PVP) - add example and better info here
-
 ## ConfigMixin
+
 [[autodoc]] ConfigMixin
+	- load_config
 	- from_config
 	- save_config
diff --git a/docs/source/api/pipelines/cycle_diffusion.mdx b/docs/source/api/pipelines/cycle_diffusion.mdx
index 50d2a5c87eca..8eecd3d62494 100644
--- a/docs/source/api/pipelines/cycle_diffusion.mdx
+++ b/docs/source/api/pipelines/cycle_diffusion.mdx
@@ -39,7 +39,7 @@ from diffusers import CycleDiffusionPipeline, DDIMScheduler
 # load the pipeline
 # make sure you're logged in with `huggingface-cli login`
 model_id_or_path = "CompVis/stable-diffusion-v1-4"
-scheduler = DDIMScheduler.from_config(model_id_or_path, subfolder="scheduler")
+scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
 pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
 
 # let's download an initial image
diff --git a/docs/source/api/pipelines/repaint.mdx b/docs/source/api/pipelines/repaint.mdx
index 0b7de8a4579d..ce262daffaeb 100644
--- a/docs/source/api/pipelines/repaint.mdx
+++ b/docs/source/api/pipelines/repaint.mdx
@@ -54,7 +54,7 @@ original_image = download_image(img_url).resize((256, 256))
 mask_image = download_image(mask_url).resize((256, 256))
 
 # Load the RePaint scheduler and pipeline based on a pretrained DDPM model
-scheduler = RePaintScheduler.from_config("google/ddpm-ema-celebahq-256")
+scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256")
 pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler)
 pipe = pipe.to("cuda")
 
diff --git a/docs/source/api/pipelines/stable_diffusion.mdx b/docs/source/api/pipelines/stable_diffusion.mdx
index 26d6a210adad..1d22024a5322 100644
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -34,13 +34,17 @@ For more details about how Stable Diffusion works and how it differs from the ba
 ### How to load and use different schedulers.
 
 The stable diffusion pipeline uses [`PNDMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the stable diffusion pipeline such as [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
-To use a different scheduler, you can pass the `scheduler` argument to `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
+To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] method or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
 
 ```python
-from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
+>>> from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
 
-euler_scheduler = EulerDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
-pipeline = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=euler_scheduler)
+>>> pipeline = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+>>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+>>> # or
+>>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+>>> pipeline = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=euler_scheduler)
 ```
 
 
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
index 463780a0727f..a50b476c3d6a 100644
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -41,7 +41,7 @@ In this guide though, you'll use [`DiffusionPipeline`] for text-to-image generat
 ```python
 >>> from diffusers import DiffusionPipeline
 
->>> generator = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
+>>> pipeline = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256")
 ```
 
 The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. 
@@ -49,13 +49,13 @@ Because the model consists of roughly 1.4 billion parameters, we strongly recomm
 You can move the generator object to GPU, just like you would in PyTorch.
 
 ```python
->>> generator.to("cuda")
+>>> pipeline.to("cuda")
 ```
 
-Now you can use the `generator` on your text prompt:
+Now you can use the `pipeline` on your text prompt:
 
 ```python
->>> image = generator("An image of a squirrel in Picasso style").images[0]
+>>> image = pipeline("An image of a squirrel in Picasso style").images[0]
 ```
 
 The output is by default wrapped into a [PIL Image object](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class).
@@ -82,7 +82,7 @@ just like we did before only that now you need to pass your `AUTH_TOKEN`:
 ```python
 >>> from diffusers import DiffusionPipeline
 
->>> generator = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_auth_token=AUTH_TOKEN)
+>>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_auth_token=AUTH_TOKEN)
 ```
 
 If you do not pass your authentication token you will see that the diffusion system will not be correctly 
@@ -102,7 +102,7 @@ token. Assuming that `"./stable-diffusion-v1-5"` is the local path to the cloned
 you can also load the pipeline as follows:
 
 ```python
->>> generator = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
+>>> pipeline = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5")
 ```
 
 Running the pipeline is then identical to the code above as it's the same model architecture.
@@ -115,19 +115,20 @@ Running the pipeline is then identical to the code above as it's the same model
 
 Diffusion systems can be used with multiple different [schedulers](./api/schedulers) each with their
 pros and cons. By default, Stable Diffusion runs with [`PNDMScheduler`], but it's very simple to 
-use a different scheduler. *E.g.* if you would instead like to use the [`LMSDiscreteScheduler`] scheduler,
+use a different scheduler. *E.g.* if you would instead like to use the [`EulerDiscreteScheduler`] scheduler,
 you could use it as follows:
 
 ```python
->>> from diffusers import LMSDiscreteScheduler
+>>> from diffusers import EulerDiscreteScheduler
 
->>> scheduler = LMSDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
+>>> pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_auth_token=AUTH_TOKEN)
 
->>> generator = StableDiffusionPipeline.from_pretrained(
-...     "runwayml/stable-diffusion-v1-5", scheduler=scheduler, use_auth_token=AUTH_TOKEN
-... )
+>>> # change scheduler to Euler
+>>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
 ```
 
+For more in-detail information on how to change between schedulers, please refer to the [Using Schedulers](./using-diffusers/schedulers) guide.
+
 [Stability AI's](https://stability.ai/) Stable Diffusion model is an impressive image generation model
 and can do much more than just generating images from text. We have dedicated a whole documentation page,
 just for Stable Diffusion [here](./conceptual/stable_diffusion).
diff --git a/docs/source/using-diffusers/loading.mdx b/docs/source/using-diffusers/loading.mdx
index 2cb980ea618a..c97ad5c5d0c9 100644
--- a/docs/source/using-diffusers/loading.mdx
+++ b/docs/source/using-diffusers/loading.mdx
@@ -19,7 +19,7 @@ In the following we explain in-detail how to easily load:
 
 - *Complete Diffusion Pipelines* via the [`DiffusionPipeline.from_pretrained`]
 - *Diffusion Models* via [`ModelMixin.from_pretrained`]
-- *Schedulers* via [`ConfigMixin.from_config`]
+- *Schedulers* via [`SchedulerMixin.from_pretrained`]
 
 ## Loading pipelines
 
@@ -137,15 +137,15 @@ from diffusers import DiffusionPipeline, EulerDiscreteScheduler, DPMSolverMultis
 
 repo_id = "runwayml/stable-diffusion-v1-5"
 
-scheduler = EulerDiscreteScheduler.from_config(repo_id, subfolder="scheduler")
+scheduler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
 # or
-# scheduler = DPMSolverMultistepScheduler.from_config(repo_id, subfolder="scheduler")
+# scheduler = DPMSolverMultistepScheduler.from_pretrained(repo_id, subfolder="scheduler")
 
 stable_diffusion = DiffusionPipeline.from_pretrained(repo_id, scheduler=scheduler)
 ```
 
 Three things are worth paying attention to here.
-- First, the scheduler is loaded with [`ConfigMixin.from_config`] since it only depends on a configuration file and not any parameterized weights
+- First, the scheduler is loaded with [`SchedulerMixin.from_pretrained`]
 - Second, the scheduler is loaded with a function argument, called `subfolder="scheduler"` as the configuration of stable diffusion's scheduling is defined in a [subfolder of the official pipeline repository](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main/scheduler)
 - Third, the scheduler instance can simply be passed with the `scheduler` keyword argument to [`DiffusionPipeline.from_pretrained`]. This works because the [`StableDiffusionPipeline`] defines its scheduler with the `scheduler` attribute. It's not possible to use a different name, such as `sampler=scheduler` since `sampler` is not a defined keyword for [`StableDiffusionPipeline.__init__`]
 
@@ -337,8 +337,8 @@ model = UNet2DModel.from_pretrained(repo_id)
 
 ## Loading schedulers
 
-Schedulers cannot be loaded via a `from_pretrained` method, but instead rely on [`ConfigMixin.from_config`]. Schedulers are **not parameterized** or **trained**, but instead purely defined by a configuration file.
-Therefore the loading method was given a different name here.
+Schedulers rely on [`SchedulerMixin.from_pretrained`]. Schedulers are **not parameterized** or **trained**, but instead purely defined by a configuration file.
+For consistency, we use the same method name as we do for models or pipelines, but no weights are loaded in this case.
 
 In constrast to pipelines or models, loading schedulers does not consume any significant amount of memory and the same configuration file can often be used for a variety of different schedulers.
 For example, all of:
@@ -367,13 +367,13 @@ from diffusers import (
 
 repo_id = "runwayml/stable-diffusion-v1-5"
 
-ddpm = DDPMScheduler.from_config(repo_id, subfolder="scheduler")
-ddim = DDIMScheduler.from_config(repo_id, subfolder="scheduler")
-pndm = PNDMScheduler.from_config(repo_id, subfolder="scheduler")
-lms = LMSDiscreteScheduler.from_config(repo_id, subfolder="scheduler")
-euler_anc = EulerAncestralDiscreteScheduler.from_config(repo_id, subfolder="scheduler")
-euler = EulerDiscreteScheduler.from_config(repo_id, subfolder="scheduler")
-dpm = DPMSolverMultistepScheduler.from_config(repo_id, subfolder="scheduler")
+ddpm = DDPMScheduler.from_pretrained(repo_id, subfolder="scheduler")
+ddim = DDIMScheduler.from_pretrained(repo_id, subfolder="scheduler")
+pndm = PNDMScheduler.from_pretrained(repo_id, subfolder="scheduler")
+lms = LMSDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
+euler_anc = EulerAncestralDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
+euler = EulerDiscreteScheduler.from_pretrained(repo_id, subfolder="scheduler")
+dpm = DPMSolverMultistepScheduler.from_pretrained(repo_id, subfolder="scheduler")
 
 # replace `dpm` with any of `ddpm`, `ddim`, `pndm`, `lms`, `euler`, `euler_anc`
 pipeline = StableDiffusionPipeline.from_pretrained(repo_id, scheduler=dpm)
diff --git a/docs/source/using-diffusers/schedulers.mdx b/docs/source/using-diffusers/schedulers.mdx
new file mode 100644
index 000000000000..87ff789747a3
--- /dev/null
+++ b/docs/source/using-diffusers/schedulers.mdx
@@ -0,0 +1,262 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Schedulers
+
+Diffusion pipelines are inherently a collection of diffusion models and schedulers that are partly independent from each other. This means that one is able to switch out parts of the pipeline to better customize 
+a pipeline to one's use case. The best example of this are the [Schedulers](../api/schedulers.mdx).
+
+Whereas diffusion models usually simply define the forward pass from noise to a less noisy sample, 
+schedulers define the whole denoising process, *i.e.*:
+- How many denoising steps?
+- Stochastic or deterministic?
+- What algorithm to use to find the denoised sample
+
+They can be quite complex and often define a trade-off between **denoising speed** and **denoising quality**.
+It is extremely difficult to measure quantitatively which scheduler works best for a given diffusion pipeline, so it is often recommended to simply try out which works best.
+
+The following paragraphs shows how to do so with the 🧨 Diffusers library.
+
+## Load pipeline
+
+Let's start by loading the stable diffusion pipeline.
+Remember that you have to be a registered user on the 🤗 Hugging Face Hub, and have "click-accepted" the [license](https://huggingface.co/runwayml/stable-diffusion-v1-5) in order to use stable diffusion.
+
+```python
+from huggingface_hub import login
+from diffusers import DiffusionPipeline
+import torch
+
+# first we need to login with our access token
+login()
+
+# Now we can download the pipeline
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+```
+
+Next, we move it to GPU:
+
+```python
+pipeline.to("cuda")
+```
+
+## Access the scheduler
+
+The scheduler is always one of the components of the pipeline and is usually called `"scheduler"`.
+So it can be accessed via the `"scheduler"` property.
+
+```python
+pipeline.scheduler
+```
+
+**Output**:
+```
+PNDMScheduler {
+  "_class_name": "PNDMScheduler",
+  "_diffusers_version": "0.8.0.dev0",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "num_train_timesteps": 1000,
+  "set_alpha_to_one": false,
+  "skip_prk_steps": true,
+  "steps_offset": 1,
+  "trained_betas": null
+}
+```
+
+We can see that the scheduler is of type [`PNDMScheduler`]. 
+Cool, now let's compare the scheduler in its performance to other schedulers.
+First we define a prompt on which we will test all the different schedulers:
+
+```python
+prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
+```
+
+Next, we create a generator from a random seed that will ensure that we can generate similar images as well as run the pipeline:
+
+```python
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator).images[0]
+image
+```
+
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_pndm.png" width="400"/>
+    <br>
+</p>
+
+
+## Changing the scheduler
+
+Now we show how easy it is to change the scheduler of a pipeline. Every scheduler has a property [`SchedulerMixin.compatibles`] 
+which defines all compatible schedulers. You can take a look at all available, compatible schedulers for the Stable Diffusion pipeline as follows.
+
+```python
+pipeline.scheduler.compatibles
+```
+
+**Output**:
+```
+[diffusers.schedulers.scheduling_lms_discrete.LMSDiscreteScheduler,
+ diffusers.schedulers.scheduling_ddim.DDIMScheduler,
+ diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler,
+ diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler,
+ diffusers.schedulers.scheduling_pndm.PNDMScheduler,
+ diffusers.schedulers.scheduling_ddpm.DDPMScheduler,
+ diffusers.schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteScheduler]
+```
+
+Cool, lots of schedulers to look at. Feel free to have a look at their respective class definitions: 
+
+- [`LMSDiscreteScheduler`], 
+- [`DDIMScheduler`], 
+- [`DPMSolverMultistepScheduler`], 
+- [`EulerDiscreteScheduler`], 
+- [`PNDMScheduler`], 
+- [`DDPMScheduler`], 
+- [`EulerAncestralDiscreteScheduler`].
+
+We will now compare the input prompt with all other schedulers. To change the scheduler of the pipeline you can make use of the 
+convenient [`ConfigMixin.config`] property in combination with the [`ConfigMixin.from_config`] function.
+
+```python
+pipeline.scheduler.config
+```
+
+returns a dictionary of the configuration of the scheduler:
+
+**Output**:
+```
+FrozenDict([('num_train_timesteps', 1000),
+            ('beta_start', 0.00085),
+            ('beta_end', 0.012),
+            ('beta_schedule', 'scaled_linear'),
+            ('trained_betas', None),
+            ('skip_prk_steps', True),
+            ('set_alpha_to_one', False),
+            ('steps_offset', 1),
+            ('_class_name', 'PNDMScheduler'),
+            ('_diffusers_version', '0.8.0.dev0'),
+            ('clip_sample', False)])
+```
+
+This configuration can then be used to instantiate a scheduler
+of a different class that is compatible with the pipeline. Here, 
+we change the scheduler to the [`DDIMScheduler`].
+
+```python
+from diffusers import DDIMScheduler
+
+pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
+```
+
+Cool, now we can run the pipeline again to compare the generation quality.
+
+```python
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator).images[0]
+image
+```
+
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_ddim.png" width="400"/>
+    <br>
+</p>
+
+
+## Compare schedulers
+
+So far we have tried running the stable diffusion pipeline with two schedulers: [`PNDMScheduler`] and [`DDIMScheduler`]. 
+A number of better schedulers have been released that can be run with much fewer steps, let's compare them here:
+
+[`LMSDiscreteScheduler`] usually leads to better results:
+
+```python
+from diffusers import LMSDiscreteScheduler
+
+pipeline.scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator).images[0]
+image
+```
+
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_lms.png" width="400"/>
+    <br>
+</p>
+
+
+[`EulerDiscreteScheduler`] and [`EulerAncestralDiscreteScheduler`] can generate high quality results with as little as 30 steps.
+
+```python
+from diffusers import EulerDiscreteScheduler
+
+pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator, num_inference_steps=30).images[0]
+image
+```
+
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_discrete.png" width="400"/>
+    <br>
+</p>
+
+
+and:
+
+```python
+from diffusers import EulerAncestralDiscreteScheduler
+
+pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator, num_inference_steps=30).images[0]
+image
+```
+
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_ancestral.png" width="400"/>
+    <br>
+</p>
+
+
+At the time of writing this doc [`DPMSolverMultistepScheduler`] gives arguably the best speed/quality trade-off and can be run with as little 
+as 20 steps.
+
+```python
+from diffusers import DPMSolverMultistepScheduler
+
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+
+generator = torch.Generator(device="cuda").manual_seed(8)
+image = pipeline(prompt, generator=generator, num_inference_steps=20).images[0]
+image
+```
+
+<p align="center">
+    <br>
+    <img src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_dpm.png" width="400"/>
+    <br>
+</p>
+
+As you can see most images look very similar and are arguably of very similar quality. It often really depends on the specific use case which scheduler to choose. A good approach is always to run multiple different
+schedulers to compare results.
diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index fc6ac9b5b97e..c4819ddc2ec6 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -29,7 +29,7 @@
 from requests import HTTPError
 
 from . import __version__
-from .utils import DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, logging
+from .utils import DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, DummyObject, deprecate, logging
 
 
 logger = logging.get_logger(__name__)
@@ -37,6 +37,38 @@
 _re_configuration_file = re.compile(r"config\.(.*)\.json")
 
 
+class FrozenDict(OrderedDict):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        for key, value in self.items():
+            setattr(self, key, value)
+
+        self.__frozen = True
+
+    def __delitem__(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
+
+    def setdefault(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
+
+    def pop(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+
+    def update(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
+
+    def __setattr__(self, name, value):
+        if hasattr(self, "__frozen") and self.__frozen:
+            raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
+        super().__setattr__(name, value)
+
+    def __setitem__(self, name, value):
+        if hasattr(self, "__frozen") and self.__frozen:
+            raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
+        super().__setitem__(name, value)
+
+
 class ConfigMixin:
     r"""
     Base class for all configuration classes. Stores all configuration parameters under `self.config` Also handles all
@@ -49,13 +81,12 @@ class ConfigMixin:
           [`~ConfigMixin.save_config`] (should be overridden by parent class).
         - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
           overridden by parent class).
-        - **_compatible_classes** (`List[str]`) -- A list of classes that are compatible with the parent class, so that
-          `from_config` can be used from a class different than the one used to save the config (should be overridden
-          by parent class).
+        - **has_compatibles** (`bool`) -- Whether the class has compatible classes (should be overridden by parent
+          class).
     """
     config_name = None
     ignore_for_config = []
-    _compatible_classes = []
+    has_compatibles = False
 
     def register_to_config(self, **kwargs):
         if self.config_name is None:
@@ -104,9 +135,98 @@ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool
         logger.info(f"Configuration saved in {output_config_file}")
 
     @classmethod
-    def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs):
+    def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_unused_kwargs=False, **kwargs):
+        r"""
+        Instantiate a Python class from a config dictionary
+
+        Parameters:
+            config (`Dict[str, Any]`):
+                A config dictionary from which the Python class will be instantiated. Make sure to only load
+                configuration files of compatible classes.
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                Whether kwargs that are not consumed by the Python class should be returned or not.
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it being loaded) and initiate the Python class.
+                `**kwargs` will be directly passed to the underlying scheduler/model's `__init__` method and eventually
+                overwrite same named arguments of `config`.
+
+        Examples:
+
+        ```python
+        >>> from diffusers import DDPMScheduler, DDIMScheduler, PNDMScheduler
+
+        >>> # Download scheduler from huggingface.co and cache.
+        >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cifar10-32")
+
+        >>> # Instantiate DDIM scheduler class with same config as DDPM
+        >>> scheduler = DDIMScheduler.from_config(scheduler.config)
+
+        >>> # Instantiate PNDM scheduler class with same config as DDPM
+        >>> scheduler = PNDMScheduler.from_config(scheduler.config)
+        ```
+        """
+        # <===== TO BE REMOVED WITH DEPRECATION
+        # TODO(Patrick) - make sure to remove the following lines when config=="model_path" is deprecated
+        if "pretrained_model_name_or_path" in kwargs:
+            config = kwargs.pop("pretrained_model_name_or_path")
+
+        if config is None:
+            raise ValueError("Please make sure to provide a config as the first positional argument.")
+        # ======>
+
+        if not isinstance(config, dict):
+            deprecation_message = "It is deprecated to pass a pretrained model name or path to `from_config`."
+            if "Scheduler" in cls.__name__:
+                deprecation_message += (
+                    f"If you were trying to load a scheduler, please use {cls}.from_pretrained(...) instead."
+                    " Otherwise, please make sure to pass a configuration dictionary instead. This functionality will"
+                    " be removed in v1.0.0."
+                )
+            elif "Model" in cls.__name__:
+                deprecation_message += (
+                    f"If you were trying to load a model, please use {cls}.load_config(...) followed by"
+                    f" {cls}.from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary"
+                    " instead. This functionality will be removed in v1.0.0."
+                )
+            deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)
+            config, kwargs = cls.load_config(pretrained_model_name_or_path=config, return_unused_kwargs=True, **kwargs)
+
+        init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, **kwargs)
+
+        # Allow dtype to be specified on initialization
+        if "dtype" in unused_kwargs:
+            init_dict["dtype"] = unused_kwargs.pop("dtype")
+
+        # Return model and optionally state and/or unused_kwargs
+        model = cls(**init_dict)
+
+        # make sure to also save config parameters that might be used for compatible classes
+        model.register_to_config(**hidden_dict)
+
+        # add hidden kwargs of compatible classes to unused_kwargs
+        unused_kwargs = {**unused_kwargs, **hidden_dict}
+
+        if return_unused_kwargs:
+            return (model, unused_kwargs)
+        else:
+            return model
+
+    @classmethod
+    def get_config_dict(cls, *args, **kwargs):
+        deprecation_message = (
+            f" The function get_config_dict is deprecated. Please use {cls}.load_config instead. This function will be"
+            " removed in version v1.0.0"
+        )
+        deprecate("get_config_dict", "1.0.0", deprecation_message, standard_warn=False)
+        return cls.load_config(*args, **kwargs)
+
+    @classmethod
+    def load_config(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs
+    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         r"""
-        Instantiate a Python class from a pre-defined JSON-file.
+        Instantiate a Python class from a config dictionary
 
         Parameters:
             pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
@@ -120,10 +240,6 @@ def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], ret
             cache_dir (`Union[str, os.PathLike]`, *optional*):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
-            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
-                Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
-                as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
-                checkpoint with 3 labels).
             force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                 cached versions if they exist.
@@ -161,33 +277,7 @@ def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], ret
         use this method in a firewalled environment.
 
         </Tip>
-
         """
-        config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs)
-        init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs)
-
-        # Allow dtype to be specified on initialization
-        if "dtype" in unused_kwargs:
-            init_dict["dtype"] = unused_kwargs.pop("dtype")
-
-        # Return model and optionally state and/or unused_kwargs
-        model = cls(**init_dict)
-        return_tuple = (model,)
-
-        # Flax schedulers have a state, so return it.
-        if cls.__name__.startswith("Flax") and hasattr(model, "create_state") and getattr(model, "has_state", False):
-            state = model.create_state()
-            return_tuple += (state,)
-
-        if return_unused_kwargs:
-            return return_tuple + (unused_kwargs,)
-        else:
-            return return_tuple if len(return_tuple) > 1 else model
-
-    @classmethod
-    def get_config_dict(
-        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
-    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
         force_download = kwargs.pop("force_download", False)
         resume_download = kwargs.pop("resume_download", False)
@@ -283,6 +373,9 @@ def get_config_dict(
         except (json.JSONDecodeError, UnicodeDecodeError):
             raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.")
 
+        if return_unused_kwargs:
+            return config_dict, kwargs
+
         return config_dict
 
     @staticmethod
@@ -291,6 +384,9 @@ def _get_init_keys(cls):
 
     @classmethod
     def extract_init_dict(cls, config_dict, **kwargs):
+        # 0. Copy origin config dict
+        original_dict = {k: v for k, v in config_dict.items()}
+
         # 1. Retrieve expected config attributes from __init__ signature
         expected_keys = cls._get_init_keys(cls)
         expected_keys.remove("self")
@@ -310,10 +406,11 @@ def extract_init_dict(cls, config_dict, **kwargs):
         # load diffusers library to import compatible and original scheduler
         diffusers_library = importlib.import_module(__name__.split(".")[0])
 
-        # remove attributes from compatible classes that orig cannot expect
-        compatible_classes = [getattr(diffusers_library, c, None) for c in cls._compatible_classes]
-        # filter out None potentially undefined dummy classes
-        compatible_classes = [c for c in compatible_classes if c is not None]
+        if cls.has_compatibles:
+            compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)]
+        else:
+            compatible_classes = []
+
         expected_keys_comp_cls = set()
         for c in compatible_classes:
             expected_keys_c = cls._get_init_keys(c)
@@ -364,7 +461,10 @@ def extract_init_dict(cls, config_dict, **kwargs):
         # 6. Define unused keyword arguments
         unused_kwargs = {**config_dict, **kwargs}
 
-        return init_dict, unused_kwargs
+        # 7. Define "hidden" config parameters that were saved for compatible classes
+        hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict and not k.startswith("_")}
+
+        return init_dict, unused_kwargs, hidden_config_dict
 
     @classmethod
     def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
@@ -377,6 +477,12 @@ def __repr__(self):
 
     @property
     def config(self) -> Dict[str, Any]:
+        """
+        Returns the config of the class as a frozen dictionary
+
+        Returns:
+            `Dict[str, Any]`: Config of the class.
+        """
         return self._internal_dict
 
     def to_json_string(self) -> str:
@@ -401,38 +507,6 @@ def to_json_file(self, json_file_path: Union[str, os.PathLike]):
             writer.write(self.to_json_string())
 
 
-class FrozenDict(OrderedDict):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        for key, value in self.items():
-            setattr(self, key, value)
-
-        self.__frozen = True
-
-    def __delitem__(self, *args, **kwargs):
-        raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
-
-    def setdefault(self, *args, **kwargs):
-        raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
-
-    def pop(self, *args, **kwargs):
-        raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
-
-    def update(self, *args, **kwargs):
-        raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
-
-    def __setattr__(self, name, value):
-        if hasattr(self, "__frozen") and self.__frozen:
-            raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
-        super().__setattr__(name, value)
-
-    def __setitem__(self, name, value):
-        if hasattr(self, "__frozen") and self.__frozen:
-            raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
-        super().__setitem__(name, value)
-
-
 def register_to_config(init):
     r"""
     Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are
diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
index 4c34e64f782a..54bb028139fb 100644
--- a/src/diffusers/pipeline_flax_utils.py
+++ b/src/diffusers/pipeline_flax_utils.py
@@ -47,7 +47,7 @@
 LOADABLE_CLASSES = {
     "diffusers": {
         "FlaxModelMixin": ["save_pretrained", "from_pretrained"],
-        "FlaxSchedulerMixin": ["save_config", "from_config"],
+        "FlaxSchedulerMixin": ["save_pretrained", "from_pretrained"],
         "FlaxDiffusionPipeline": ["save_pretrained", "from_pretrained"],
     },
     "transformers": {
@@ -280,7 +280,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         >>> from diffusers import FlaxDPMSolverMultistepScheduler
 
         >>> model_id = "runwayml/stable-diffusion-v1-5"
-        >>> sched, sched_state = FlaxDPMSolverMultistepScheduler.from_config(
+        >>> sched, sched_state = FlaxDPMSolverMultistepScheduler.from_pretrained(
         ...     model_id,
         ...     subfolder="scheduler",
         ... )
@@ -303,7 +303,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         # 1. Download the checkpoints and configs
         # use snapshot download here to get it working from from_pretrained
         if not os.path.isdir(pretrained_model_name_or_path):
-            config_dict = cls.get_config_dict(
+            config_dict = cls.load_config(
                 pretrained_model_name_or_path,
                 cache_dir=cache_dir,
                 resume_download=resume_download,
@@ -349,7 +349,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         else:
             cached_folder = pretrained_model_name_or_path
 
-        config_dict = cls.get_config_dict(cached_folder)
+        config_dict = cls.load_config(cached_folder)
 
         # 2. Load the pipeline class, if using custom module then load it from the hub
         # if we load from explicit class, let's use it
@@ -370,7 +370,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         expected_modules = set(inspect.signature(pipeline_class.__init__).parameters.keys())
         passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
 
-        init_dict, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
+        init_dict, _, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
 
         init_kwargs = {}
 
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index a194f3eb34d9..b4b1b9dd0c55 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -65,7 +65,7 @@
 LOADABLE_CLASSES = {
     "diffusers": {
         "ModelMixin": ["save_pretrained", "from_pretrained"],
-        "SchedulerMixin": ["save_config", "from_config"],
+        "SchedulerMixin": ["save_pretrained", "from_pretrained"],
         "DiffusionPipeline": ["save_pretrained", "from_pretrained"],
         "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"],
     },
@@ -207,7 +207,7 @@ def to(self, torch_device: Optional[Union[str, torch.device]] = None):
         if torch_device is None:
             return self
 
-        module_names, _ = self.extract_init_dict(dict(self.config))
+        module_names, _, _ = self.extract_init_dict(dict(self.config))
         for name in module_names.keys():
             module = getattr(self, name)
             if isinstance(module, torch.nn.Module):
@@ -228,7 +228,7 @@ def device(self) -> torch.device:
         Returns:
             `torch.device`: The torch device on which the pipeline is located.
         """
-        module_names, _ = self.extract_init_dict(dict(self.config))
+        module_names, _, _ = self.extract_init_dict(dict(self.config))
         for name in module_names.keys():
             module = getattr(self, name)
             if isinstance(module, torch.nn.Module):
@@ -377,11 +377,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens)
         >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 
-        >>> # Download pipeline, but overwrite scheduler
+        >>> # Use a different scheduler
         >>> from diffusers import LMSDiscreteScheduler
 
-        >>> scheduler = LMSDiscreteScheduler.from_config("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
-        >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", scheduler=scheduler)
+        >>> scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config)
+        >>> pipeline.scheduler = scheduler
         ```
         """
         cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
@@ -428,7 +428,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         # 1. Download the checkpoints and configs
         # use snapshot download here to get it working from from_pretrained
         if not os.path.isdir(pretrained_model_name_or_path):
-            config_dict = cls.get_config_dict(
+            config_dict = cls.load_config(
                 pretrained_model_name_or_path,
                 cache_dir=cache_dir,
                 resume_download=resume_download,
@@ -474,7 +474,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         else:
             cached_folder = pretrained_model_name_or_path
 
-        config_dict = cls.get_config_dict(cached_folder)
+        config_dict = cls.load_config(cached_folder)
 
         # 2. Load the pipeline class, if using custom module then load it from the hub
         # if we load from explicit class, let's use it
@@ -513,7 +513,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         expected_modules = set(inspect.signature(pipeline_class.__init__).parameters.keys()) - set(["self"])
         passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
 
-        init_dict, unused_kwargs = pipeline_class.extract_init_dict(config_dict, **kwargs)
+        init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
 
         if len(unused_kwargs) > 0:
             logger.warning(f"Keyword arguments {unused_kwargs} not recognized.")
diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index b7194664f4c4..c937a230031e 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -71,7 +71,7 @@ def __call__(
         """
         message = (
             "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
-            " DDPMScheduler.from_config(<model_id>, predict_epsilon=True)`."
+            " DDPMScheduler.from_pretrained(<model_id>, predict_epsilon=True)`."
         )
         predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/README.md b/src/diffusers/pipelines/stable_diffusion/README.md
index a76e4c668272..bc30be4a7b9d 100644
--- a/src/diffusers/pipelines/stable_diffusion/README.md
+++ b/src/diffusers/pipelines/stable_diffusion/README.md
@@ -72,7 +72,7 @@ image.save("astronaut_rides_horse.png")
 # make sure you're logged in with `huggingface-cli login`
 from diffusers import StableDiffusionPipeline, DDIMScheduler
 
-scheduler =  DDIMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+scheduler =  DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
 
 pipe = StableDiffusionPipeline.from_pretrained(
     "runwayml/stable-diffusion-v1-5", 
@@ -91,7 +91,7 @@ image.save("astronaut_rides_horse.png")
 # make sure you're logged in with `huggingface-cli login`
 from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
 
-lms = LMSDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+lms = LMSDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
 
 pipe = StableDiffusionPipeline.from_pretrained(
     "runwayml/stable-diffusion-v1-5", 
@@ -120,7 +120,7 @@ from diffusers import CycleDiffusionPipeline, DDIMScheduler
 # load the pipeline
 # make sure you're logged in with `huggingface-cli login`
 model_id_or_path = "CompVis/stable-diffusion-v1-4"
-scheduler = DDIMScheduler.from_config(model_id_or_path, subfolder="scheduler")
+scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler")
 pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda")
 
 # let's download an initial image
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 75cef635d063..1326b503ed03 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -23,7 +23,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput
+from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput
 from .scheduling_utils import SchedulerMixin
 
 
@@ -82,8 +82,8 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details, see the original paper: https://arxiv.org/abs/2010.02502
 
@@ -109,14 +109,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
-    _compatible_classes = [
-        "PNDMScheduler",
-        "DDPMScheduler",
-        "LMSDiscreteScheduler",
-        "EulerDiscreteScheduler",
-        "EulerAncestralDiscreteScheduler",
-        "DPMSolverMultistepScheduler",
-    ]
+    _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/schedulers/scheduling_ddim_flax.py b/src/diffusers/schedulers/scheduling_ddim_flax.py
index 590e3aac2e49..ceef96a4a95f 100644
--- a/src/diffusers/schedulers/scheduling_ddim_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddim_flax.py
@@ -23,7 +23,12 @@
 import jax.numpy as jnp
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+from .scheduling_utils_flax import (
+    _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    broadcast_to_shape_from_left,
+)
 
 
 def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> jnp.ndarray:
@@ -79,8 +84,8 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details, see the original paper: https://arxiv.org/abs/2010.02502
 
@@ -105,6 +110,8 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
             stable diffusion.
     """
 
+    _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+
     @property
     def has_state(self):
         return True
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index c3e373d2bdca..299a06f4eb13 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -22,7 +22,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, FrozenDict, register_to_config
-from ..utils import BaseOutput, deprecate
+from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput, deprecate
 from .scheduling_utils import SchedulerMixin
 
 
@@ -80,8 +80,8 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details, see the original paper: https://arxiv.org/abs/2006.11239
 
@@ -104,14 +104,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
-    _compatible_classes = [
-        "DDIMScheduler",
-        "PNDMScheduler",
-        "LMSDiscreteScheduler",
-        "EulerDiscreteScheduler",
-        "EulerAncestralDiscreteScheduler",
-        "DPMSolverMultistepScheduler",
-    ]
+    _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
 
     @register_to_config
     def __init__(
@@ -249,7 +242,7 @@ def step(
         """
         message = (
             "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
-            " DDPMScheduler.from_config(<model_id>, predict_epsilon=True)`."
+            " DDPMScheduler.from_pretrained(<model_id>, predict_epsilon=True)`."
         )
         predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
         if predict_epsilon is not None and predict_epsilon != self.config.predict_epsilon:
diff --git a/src/diffusers/schedulers/scheduling_ddpm_flax.py b/src/diffusers/schedulers/scheduling_ddpm_flax.py
index f1b04a04176e..480cbda73c65 100644
--- a/src/diffusers/schedulers/scheduling_ddpm_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddpm_flax.py
@@ -24,7 +24,12 @@
 
 from ..configuration_utils import ConfigMixin, FrozenDict, register_to_config
 from ..utils import deprecate
-from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+from .scheduling_utils_flax import (
+    _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    broadcast_to_shape_from_left,
+)
 
 
 def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> jnp.ndarray:
@@ -79,8 +84,8 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details, see the original paper: https://arxiv.org/abs/2006.11239
 
@@ -103,6 +108,8 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
 
     """
 
+    _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+
     @property
     def has_state(self):
         return True
@@ -221,7 +228,7 @@ def step(
         """
         message = (
             "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
-            " DDPMScheduler.from_config(<model_id>, predict_epsilon=True)`."
+            " DDPMScheduler.from_pretrained(<model_id>, predict_epsilon=True)`."
         )
         predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
         if predict_epsilon is not None and predict_epsilon != self.config.predict_epsilon:
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
index d166354809b0..472b24637dcf 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -21,6 +21,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS
 from .scheduling_utils import SchedulerMixin, SchedulerOutput
 
 
@@ -71,8 +72,8 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
@@ -116,14 +117,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
-    _compatible_classes = [
-        "DDIMScheduler",
-        "DDPMScheduler",
-        "PNDMScheduler",
-        "LMSDiscreteScheduler",
-        "EulerDiscreteScheduler",
-        "EulerAncestralDiscreteScheduler",
-    ]
+    _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
index c9a6d1cd5c0b..d6fa3835346b 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
@@ -23,7 +23,12 @@
 import jax.numpy as jnp
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+from .scheduling_utils_flax import (
+    _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    broadcast_to_shape_from_left,
+)
 
 
 def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999) -> jnp.ndarray:
@@ -96,8 +101,8 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095
 
@@ -143,6 +148,8 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
 
     """
 
+    _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+
     @property
     def has_state(self):
         return True
diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
index 621b5c17c0e9..f3abf017d9e5 100644
--- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -19,7 +19,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput, logging
+from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput, logging
 from .scheduling_utils import SchedulerMixin
 
 
@@ -52,8 +52,8 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
@@ -67,14 +67,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
-    _compatible_classes = [
-        "DDIMScheduler",
-        "DDPMScheduler",
-        "LMSDiscreteScheduler",
-        "PNDMScheduler",
-        "EulerDiscreteScheduler",
-        "DPMSolverMultistepScheduler",
-    ]
+    _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index 2f9e938474ef..d9991bc3a097 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -19,7 +19,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput, logging
+from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput, logging
 from .scheduling_utils import SchedulerMixin
 
 
@@ -53,8 +53,8 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
@@ -68,14 +68,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
-    _compatible_classes = [
-        "DDIMScheduler",
-        "DDPMScheduler",
-        "LMSDiscreteScheduler",
-        "PNDMScheduler",
-        "EulerAncestralDiscreteScheduler",
-        "DPMSolverMultistepScheduler",
-    ]
+    _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/schedulers/scheduling_ipndm.py b/src/diffusers/schedulers/scheduling_ipndm.py
index fb413a280530..e5495713a834 100644
--- a/src/diffusers/schedulers/scheduling_ipndm.py
+++ b/src/diffusers/schedulers/scheduling_ipndm.py
@@ -28,8 +28,8 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details, see the original paper: https://arxiv.org/abs/2202.09778
 
diff --git a/src/diffusers/schedulers/scheduling_karras_ve.py b/src/diffusers/schedulers/scheduling_karras_ve.py
index 743f2e061c53..b2eb332aed0f 100644
--- a/src/diffusers/schedulers/scheduling_karras_ve.py
+++ b/src/diffusers/schedulers/scheduling_karras_ve.py
@@ -56,8 +56,8 @@ class KarrasVeScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details on the parameters, see the original paper's Appendix E.: "Elucidating the Design Space of
     Diffusion-Based Generative Models." https://arxiv.org/abs/2206.00364. The grid search values used to find the
diff --git a/src/diffusers/schedulers/scheduling_karras_ve_flax.py b/src/diffusers/schedulers/scheduling_karras_ve_flax.py
index 78ab007954aa..c4e612c3cc84 100644
--- a/src/diffusers/schedulers/scheduling_karras_ve_flax.py
+++ b/src/diffusers/schedulers/scheduling_karras_ve_flax.py
@@ -67,8 +67,8 @@ class FlaxKarrasVeScheduler(FlaxSchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details on the parameters, see the original paper's Appendix E.: "Elucidating the Design Space of
     Diffusion-Based Generative Models." https://arxiv.org/abs/2206.00364. The grid search values used to find the
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index 373c373ee0fe..8a9aedb41bf1 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -21,7 +21,7 @@
 from scipy import integrate
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput
+from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput
 from .scheduling_utils import SchedulerMixin
 
 
@@ -52,8 +52,8 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
@@ -67,14 +67,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
-    _compatible_classes = [
-        "DDIMScheduler",
-        "DDPMScheduler",
-        "PNDMScheduler",
-        "EulerDiscreteScheduler",
-        "EulerAncestralDiscreteScheduler",
-        "DPMSolverMultistepScheduler",
-    ]
+    _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/schedulers/scheduling_lms_discrete_flax.py b/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
index 20982d38aa40..21f25f72facf 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete_flax.py
@@ -20,7 +20,12 @@
 from scipy import integrate
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+from .scheduling_utils_flax import (
+    _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    broadcast_to_shape_from_left,
+)
 
 
 @flax.struct.dataclass
@@ -49,8 +54,8 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
@@ -63,6 +68,8 @@ class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin):
             option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
     """
 
+    _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+
     @property
     def has_state(self):
         return True
diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py
index eec18af8d382..8bf0a5958266 100644
--- a/src/diffusers/schedulers/scheduling_pndm.py
+++ b/src/diffusers/schedulers/scheduling_pndm.py
@@ -21,6 +21,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS
 from .scheduling_utils import SchedulerMixin, SchedulerOutput
 
 
@@ -60,8 +61,8 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details, see the original paper: https://arxiv.org/abs/2202.09778
 
@@ -88,14 +89,7 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
-    _compatible_classes = [
-        "DDIMScheduler",
-        "DDPMScheduler",
-        "LMSDiscreteScheduler",
-        "EulerDiscreteScheduler",
-        "EulerAncestralDiscreteScheduler",
-        "DPMSolverMultistepScheduler",
-    ]
+    _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/schedulers/scheduling_pndm_flax.py b/src/diffusers/schedulers/scheduling_pndm_flax.py
index 357ecfe046c1..298e62de20d1 100644
--- a/src/diffusers/schedulers/scheduling_pndm_flax.py
+++ b/src/diffusers/schedulers/scheduling_pndm_flax.py
@@ -23,7 +23,12 @@
 import jax.numpy as jnp
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left
+from .scheduling_utils_flax import (
+    _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    FlaxSchedulerMixin,
+    FlaxSchedulerOutput,
+    broadcast_to_shape_from_left,
+)
 
 
 def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999) -> jnp.ndarray:
@@ -87,8 +92,8 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details, see the original paper: https://arxiv.org/abs/2202.09778
 
@@ -114,6 +119,8 @@ class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin):
             stable diffusion.
     """
 
+    _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+
     @property
     def has_state(self):
         return True
diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py
index 1751f41c66f2..55625c1bfa92 100644
--- a/src/diffusers/schedulers/scheduling_repaint.py
+++ b/src/diffusers/schedulers/scheduling_repaint.py
@@ -77,8 +77,8 @@ class RePaintScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details, see the original paper: https://arxiv.org/pdf/2201.09865.pdf
 
diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py
index d31adbc3c673..1d436ab0cbbf 100644
--- a/src/diffusers/schedulers/scheduling_sde_ve.py
+++ b/src/diffusers/schedulers/scheduling_sde_ve.py
@@ -50,8 +50,8 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
diff --git a/src/diffusers/schedulers/scheduling_sde_ve_flax.py b/src/diffusers/schedulers/scheduling_sde_ve_flax.py
index d3eadede61b5..d1f762bc90c4 100644
--- a/src/diffusers/schedulers/scheduling_sde_ve_flax.py
+++ b/src/diffusers/schedulers/scheduling_sde_ve_flax.py
@@ -64,8 +64,8 @@ class FlaxScoreSdeVeScheduler(FlaxSchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
diff --git a/src/diffusers/schedulers/scheduling_sde_vp.py b/src/diffusers/schedulers/scheduling_sde_vp.py
index a37a159a87ac..537d6f7e2a29 100644
--- a/src/diffusers/schedulers/scheduling_sde_vp.py
+++ b/src/diffusers/schedulers/scheduling_sde_vp.py
@@ -29,8 +29,8 @@ class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more information, see the original paper: https://arxiv.org/abs/2011.13456
 
diff --git a/src/diffusers/schedulers/scheduling_utils.py b/src/diffusers/schedulers/scheduling_utils.py
index 29bf982f6adf..90ab674e38a4 100644
--- a/src/diffusers/schedulers/scheduling_utils.py
+++ b/src/diffusers/schedulers/scheduling_utils.py
@@ -11,7 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib
+import os
 from dataclasses import dataclass
+from typing import Any, Dict, Optional, Union
 
 import torch
 
@@ -38,6 +41,114 @@ class SchedulerOutput(BaseOutput):
 class SchedulerMixin:
     """
     Mixin containing common functions for the schedulers.
+
+    Class attributes:
+        - **_compatibles** (`List[str]`) -- A list of classes that are compatible with the parent class, so that
+          `from_config` can be used from a class different than the one used to save the config (should be overridden
+          by parent class).
     """
 
     config_name = SCHEDULER_CONFIG_NAME
+    _compatibles = []
+    has_compatibles = True
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Dict[str, Any] = None,
+        subfolder: Optional[str] = None,
+        return_unused_kwargs=False,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a Scheduler class from a pre-defined JSON configuration file inside a directory or Hub repo.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
+                      organization name, like `google/ddpm-celebahq-256`.
+                    - A path to a *directory* containing the schedluer configurations saved using
+                      [`~SchedulerMixin.save_pretrained`], e.g., `./my_model_directory/`.
+            subfolder (`str`, *optional*):
+                In case the relevant files are located inside a subfolder of the model repo (either remote in
+                huggingface.co or downloaded locally), you can specify the folder name here.
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                Whether kwargs that are not consumed by the Python class should be returned or not.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory in which a downloaded pretrained model configuration should be cached if the
+                standard cache should not be used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+                file exists.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only(`bool`, *optional*, defaults to `False`):
+                Whether or not to only look at local files (i.e., do not try to download the model).
+            use_auth_token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+                when running `transformers-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                identifier allowed by git.
+
+        <Tip>
+
+         It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
+         models](https://huggingface.co/docs/hub/models-gated#gated-models).
+
+        </Tip>
+
+        <Tip>
+
+        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
+        use this method in a firewalled environment.
+
+        </Tip>
+
+        """
+        config, kwargs = cls.load_config(
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
+            subfolder=subfolder,
+            return_unused_kwargs=True,
+            **kwargs,
+        )
+        return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs)
+
+    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
+        """
+        Save a scheduler configuration object to the directory `save_directory`, so that it can be re-loaded using the
+        [`~SchedulerMixin.from_pretrained`] class method.
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the configuration JSON file will be saved (will be created if it does not exist).
+        """
+        self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
+
+    @property
+    def compatibles(self):
+        """
+        Returns all schedulers that are compatible with this scheduler
+
+        Returns:
+            `List[SchedulerMixin]`: List of compatible schedulers
+        """
+        return self._get_compatibles()
+
+    @classmethod
+    def _get_compatibles(cls):
+        compatible_classes_str = list(set([cls.__name__] + cls._compatibles))
+        diffusers_library = importlib.import_module(__name__.split(".")[0])
+        compatible_classes = [
+            getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c)
+        ]
+        return compatible_classes
diff --git a/src/diffusers/schedulers/scheduling_utils_flax.py b/src/diffusers/schedulers/scheduling_utils_flax.py
index e545cfe2479e..b3024ca450d0 100644
--- a/src/diffusers/schedulers/scheduling_utils_flax.py
+++ b/src/diffusers/schedulers/scheduling_utils_flax.py
@@ -11,15 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib
+import os
 from dataclasses import dataclass
-from typing import Tuple
+from typing import Any, Dict, Optional, Tuple, Union
 
 import jax.numpy as jnp
 
-from ..utils import BaseOutput
+from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput
 
 
 SCHEDULER_CONFIG_NAME = "scheduler_config.json"
+_FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS = ["Flax" + c for c in _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS]
 
 
 @dataclass
@@ -39,9 +42,123 @@ class FlaxSchedulerOutput(BaseOutput):
 class FlaxSchedulerMixin:
     """
     Mixin containing common functions for the schedulers.
+
+    Class attributes:
+        - **_compatibles** (`List[str]`) -- A list of classes that are compatible with the parent class, so that
+          `from_config` can be used from a class different than the one used to save the config (should be overridden
+          by parent class).
     """
 
     config_name = SCHEDULER_CONFIG_NAME
+    _compatibles = []
+    has_compatibles = True
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Dict[str, Any] = None,
+        subfolder: Optional[str] = None,
+        return_unused_kwargs=False,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a Scheduler class from a pre-defined JSON-file.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an
+                      organization name, like `google/ddpm-celebahq-256`.
+                    - A path to a *directory* containing model weights saved using [`~SchedulerMixin.save_pretrained`],
+                      e.g., `./my_model_directory/`.
+            subfolder (`str`, *optional*):
+                In case the relevant files are located inside a subfolder of the model repo (either remote in
+                huggingface.co or downloaded locally), you can specify the folder name here.
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                Whether kwargs that are not consumed by the Python class should be returned or not.
+
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory in which a downloaded pretrained model configuration should be cached if the
+                standard cache should not be used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+                file exists.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only(`bool`, *optional*, defaults to `False`):
+                Whether or not to only look at local files (i.e., do not try to download the model).
+            use_auth_token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+                when running `transformers-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                identifier allowed by git.
+
+        <Tip>
+
+         It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
+         models](https://huggingface.co/docs/hub/models-gated#gated-models).
+
+        </Tip>
+
+        <Tip>
+
+        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
+        use this method in a firewalled environment.
+
+        </Tip>
+
+        """
+        config, kwargs = cls.load_config(
+            pretrained_model_name_or_path=pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs
+        )
+        scheduler, unused_kwargs = cls.from_config(config, return_unused_kwargs=True, **kwargs)
+
+        if hasattr(scheduler, "create_state") and getattr(scheduler, "has_state", False):
+            state = scheduler.create_state()
+
+        if return_unused_kwargs:
+            return scheduler, state, unused_kwargs
+
+        return scheduler, state
+
+    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
+        """
+        Save a scheduler configuration object to the directory `save_directory`, so that it can be re-loaded using the
+        [`~FlaxSchedulerMixin.from_pretrained`] class method.
+
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the configuration JSON file will be saved (will be created if it does not exist).
+        """
+        self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
+
+    @property
+    def compatibles(self):
+        """
+        Returns all schedulers that are compatible with this scheduler
+
+        Returns:
+            `List[SchedulerMixin]`: List of compatible schedulers
+        """
+        return self._get_compatibles()
+
+    @classmethod
+    def _get_compatibles(cls):
+        compatible_classes_str = list(set([cls.__name__] + cls._compatibles))
+        diffusers_library = importlib.import_module(__name__.split(".")[0])
+        compatible_classes = [
+            getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c)
+        ]
+        return compatible_classes
 
 
 def broadcast_to_shape_from_left(x: jnp.ndarray, shape: Tuple[int]) -> jnp.ndarray:
diff --git a/src/diffusers/schedulers/scheduling_vq_diffusion.py b/src/diffusers/schedulers/scheduling_vq_diffusion.py
index dbe320d998a3..91c46e655451 100644
--- a/src/diffusers/schedulers/scheduling_vq_diffusion.py
+++ b/src/diffusers/schedulers/scheduling_vq_diffusion.py
@@ -112,8 +112,8 @@ class VQDiffusionScheduler(SchedulerMixin, ConfigMixin):
 
     [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
     function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
+    [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and
+    [`~SchedulerMixin.from_pretrained`] functions.
 
     For more details, see the original paper: https://arxiv.org/abs/2111.14822
 
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index a00e1f4dcd4c..a80d24949899 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -72,3 +72,13 @@
 DIFFUSERS_CACHE = default_cache_path
 DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules"
 HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules"))
+
+_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS = [
+    "DDIMScheduler",
+    "DDPMScheduler",
+    "PNDMScheduler",
+    "LMSDiscreteScheduler",
+    "EulerDiscreteScheduler",
+    "EulerAncestralDiscreteScheduler",
+    "DPMSolverMultistepScheduler",
+]
diff --git a/tests/models/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py
index 41c4fdecfa0a..089d935651a5 100644
--- a/tests/models/test_models_unet_1d.py
+++ b/tests/models/test_models_unet_1d.py
@@ -67,8 +67,8 @@ def test_from_pretrained_save_pretrained(self):
         super().test_from_pretrained_save_pretrained()
 
     @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
-    def test_model_from_config(self):
-        super().test_model_from_config()
+    def test_model_from_pretrained(self):
+        super().test_model_from_pretrained()
 
     @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
     def test_output(self):
@@ -187,8 +187,8 @@ def test_from_pretrained_save_pretrained(self):
         super().test_from_pretrained_save_pretrained()
 
     @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
-    def test_model_from_config(self):
-        super().test_model_from_config()
+    def test_model_from_pretrained(self):
+        super().test_model_from_pretrained()
 
     @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS")
     def test_output(self):
diff --git a/tests/pipelines/ddim/test_ddim.py b/tests/pipelines/ddim/test_ddim.py
index 81c49912be5c..2d03383599e0 100644
--- a/tests/pipelines/ddim/test_ddim.py
+++ b/tests/pipelines/ddim/test_ddim.py
@@ -75,7 +75,7 @@ def test_inference_ema_bedroom(self):
         model_id = "google/ddpm-ema-bedroom-256"
 
         unet = UNet2DModel.from_pretrained(model_id)
-        scheduler = DDIMScheduler.from_config(model_id)
+        scheduler = DDIMScheduler.from_pretrained(model_id)
 
         ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
         ddpm.to(torch_device)
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
index 14bc09469773..ef293109bf7d 100644
--- a/tests/pipelines/ddpm/test_ddpm.py
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -106,7 +106,7 @@ def test_inference_cifar10(self):
         model_id = "google/ddpm-cifar10-32"
 
         unet = UNet2DModel.from_pretrained(model_id)
-        scheduler = DDPMScheduler.from_config(model_id)
+        scheduler = DDPMScheduler.from_pretrained(model_id)
 
         ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
         ddpm.to(torch_device)
diff --git a/tests/pipelines/repaint/test_repaint.py b/tests/pipelines/repaint/test_repaint.py
index 23544dfd2468..3ab0efc875f1 100644
--- a/tests/pipelines/repaint/test_repaint.py
+++ b/tests/pipelines/repaint/test_repaint.py
@@ -44,7 +44,7 @@ def test_celebahq(self):
 
         model_id = "google/ddpm-ema-celebahq-256"
         unet = UNet2DModel.from_pretrained(model_id)
-        scheduler = RePaintScheduler.from_config(model_id)
+        scheduler = RePaintScheduler.from_pretrained(model_id)
 
         repaint = RePaintPipeline(unet=unet, scheduler=scheduler).to(torch_device)
 
diff --git a/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/tests/pipelines/score_sde_ve/test_score_sde_ve.py
index 55dcc1cea1e0..9cdf3f0191e1 100644
--- a/tests/pipelines/score_sde_ve/test_score_sde_ve.py
+++ b/tests/pipelines/score_sde_ve/test_score_sde_ve.py
@@ -74,7 +74,7 @@ def test_inference(self):
         model_id = "google/ncsnpp-church-256"
         model = UNet2DModel.from_pretrained(model_id)
 
-        scheduler = ScoreSdeVeScheduler.from_config(model_id)
+        scheduler = ScoreSdeVeScheduler.from_pretrained(model_id)
 
         sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler)
         sde_ve.to(torch_device)
diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
index de918c7e5c22..7a32b74096c4 100644
--- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py
@@ -281,7 +281,7 @@ def test_cycle_diffusion_pipeline_fp16(self):
         init_image = init_image.resize((512, 512))
 
         model_id = "CompVis/stable-diffusion-v1-4"
-        scheduler = DDIMScheduler.from_config(model_id, subfolder="scheduler")
+        scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
         pipe = CycleDiffusionPipeline.from_pretrained(
             model_id, scheduler=scheduler, safety_checker=None, torch_dtype=torch.float16, revision="fp16"
         )
@@ -322,7 +322,7 @@ def test_cycle_diffusion_pipeline(self):
         init_image = init_image.resize((512, 512))
 
         model_id = "CompVis/stable-diffusion-v1-4"
-        scheduler = DDIMScheduler.from_config(model_id, subfolder="scheduler")
+        scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
         pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, safety_checker=None)
 
         pipe.to(torch_device)
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
index a1946e39f912..a2b48d27e6e0 100644
--- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py
@@ -75,7 +75,7 @@ def test_inference_default_pndm(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
     def test_inference_ddim(self):
-        ddim_scheduler = DDIMScheduler.from_config(
+        ddim_scheduler = DDIMScheduler.from_pretrained(
             "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx"
         )
         sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
@@ -98,7 +98,7 @@ def test_inference_ddim(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
     def test_inference_k_lms(self):
-        lms_scheduler = LMSDiscreteScheduler.from_config(
+        lms_scheduler = LMSDiscreteScheduler.from_pretrained(
             "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx"
         )
         sd_pipe = OnnxStableDiffusionPipeline.from_pretrained(
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
index 61831c64c0ae..91e4412425b4 100644
--- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py
@@ -93,7 +93,7 @@ def test_inference_k_lms(self):
             "/img2img/sketch-mountains-input.jpg"
         )
         init_image = init_image.resize((768, 512))
-        lms_scheduler = LMSDiscreteScheduler.from_config(
+        lms_scheduler = LMSDiscreteScheduler.from_pretrained(
             "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx"
         )
         pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
index 4ba8e273b497..507375bddbfd 100644
--- a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py
@@ -97,7 +97,7 @@ def test_inference_k_lms(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
         )
-        lms_scheduler = LMSDiscreteScheduler.from_config(
+        lms_scheduler = LMSDiscreteScheduler.from_pretrained(
             "runwayml/stable-diffusion-inpainting", subfolder="scheduler", revision="onnx"
         )
         pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained(
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 87d238c869a3..17a293e605fd 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -703,7 +703,7 @@ def test_stable_diffusion(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_stable_diffusion_fast_ddim(self):
-        scheduler = DDIMScheduler.from_config("CompVis/stable-diffusion-v1-1", subfolder="scheduler")
+        scheduler = DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-1", subfolder="scheduler")
 
         sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1", scheduler=scheduler)
         sd_pipe = sd_pipe.to(torch_device)
@@ -726,7 +726,7 @@ def test_lms_stable_diffusion_pipeline(self):
         model_id = "CompVis/stable-diffusion-v1-1"
         pipe = StableDiffusionPipeline.from_pretrained(model_id).to(torch_device)
         pipe.set_progress_bar_config(disable=None)
-        scheduler = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
+        scheduler = LMSDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
         pipe.scheduler = scheduler
 
         prompt = "a photograph of an astronaut riding a horse"
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
index 3c0fa8aa818f..d86b259eae9e 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
@@ -520,7 +520,7 @@ def test_stable_diffusion_img2img_pipeline_k_lms(self):
         )
 
         model_id = "CompVis/stable-diffusion-v1-4"
-        lms = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
+        lms = LMSDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
             model_id,
             scheduler=lms,
@@ -557,7 +557,7 @@ def test_stable_diffusion_img2img_pipeline_ddim(self):
         )
 
         model_id = "CompVis/stable-diffusion-v1-4"
-        ddim = DDIMScheduler.from_config(model_id, subfolder="scheduler")
+        ddim = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler")
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
             model_id,
             scheduler=ddim,
@@ -649,7 +649,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         init_image = init_image.resize((768, 512))
 
         model_id = "CompVis/stable-diffusion-v1-4"
-        lms = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
+        lms = LMSDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
         pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
             model_id, scheduler=lms, safety_checker=None, device_map="auto", revision="fp16", torch_dtype=torch.float16
         )
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 8d269c38f9f4..ce231a1a460b 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -400,7 +400,7 @@ def test_stable_diffusion_inpaint_pipeline_pndm(self):
         )
 
         model_id = "runwayml/stable-diffusion-inpainting"
-        pndm = PNDMScheduler.from_config(model_id, subfolder="scheduler")
+        pndm = PNDMScheduler.from_pretrained(model_id, subfolder="scheduler")
         pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None, scheduler=pndm)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
@@ -437,7 +437,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         )
 
         model_id = "runwayml/stable-diffusion-inpainting"
-        pndm = PNDMScheduler.from_config(model_id, subfolder="scheduler")
+        pndm = PNDMScheduler.from_pretrained(model_id, subfolder="scheduler")
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             model_id,
             safety_checker=None,
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index 4b535dc9dfbd..94106b6ba819 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -401,7 +401,7 @@ def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
         )
 
         model_id = "CompVis/stable-diffusion-v1-4"
-        lms = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
+        lms = LMSDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
         pipe = StableDiffusionInpaintPipeline.from_pretrained(
             model_id,
             scheduler=lms,
diff --git a/tests/test_config.py b/tests/test_config.py
index 8ae8e1d9e173..0875930e37cc 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -13,12 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
-import os
 import tempfile
 import unittest
 
-import diffusers
 from diffusers import (
     DDIMScheduler,
     DDPMScheduler,
@@ -81,7 +78,7 @@ def __init__(
 class ConfigTester(unittest.TestCase):
     def test_load_not_from_mixin(self):
         with self.assertRaises(ValueError):
-            ConfigMixin.from_config("dummy_path")
+            ConfigMixin.load_config("dummy_path")
 
     def test_register_to_config(self):
         obj = SampleObject()
@@ -131,7 +128,7 @@ def test_save_load(self):
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             obj.save_config(tmpdirname)
-            new_obj = SampleObject.from_config(tmpdirname)
+            new_obj = SampleObject.from_config(SampleObject.load_config(tmpdirname))
             new_config = new_obj.config
 
         # unfreeze configs
@@ -142,117 +139,13 @@ def test_save_load(self):
         assert new_config.pop("c") == [2, 5]  # saved & loaded as list because of json
         assert config == new_config
 
-    def test_save_load_from_different_config(self):
-        obj = SampleObject()
-
-        # mock add obj class to `diffusers`
-        setattr(diffusers, "SampleObject", SampleObject)
-        logger = logging.get_logger("diffusers.configuration_utils")
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            obj.save_config(tmpdirname)
-            with CaptureLogger(logger) as cap_logger_1:
-                new_obj_1 = SampleObject2.from_config(tmpdirname)
-
-            # now save a config parameter that is not expected
-            with open(os.path.join(tmpdirname, SampleObject.config_name), "r") as f:
-                data = json.load(f)
-                data["unexpected"] = True
-
-            with open(os.path.join(tmpdirname, SampleObject.config_name), "w") as f:
-                json.dump(data, f)
-
-            with CaptureLogger(logger) as cap_logger_2:
-                new_obj_2 = SampleObject.from_config(tmpdirname)
-
-            with CaptureLogger(logger) as cap_logger_3:
-                new_obj_3 = SampleObject2.from_config(tmpdirname)
-
-        assert new_obj_1.__class__ == SampleObject2
-        assert new_obj_2.__class__ == SampleObject
-        assert new_obj_3.__class__ == SampleObject2
-
-        assert cap_logger_1.out == ""
-        assert (
-            cap_logger_2.out
-            == "The config attributes {'unexpected': True} were passed to SampleObject, but are not expected and will"
-            " be ignored. Please verify your config.json configuration file.\n"
-        )
-        assert cap_logger_2.out.replace("SampleObject", "SampleObject2") == cap_logger_3.out
-
-    def test_save_load_compatible_schedulers(self):
-        SampleObject2._compatible_classes = ["SampleObject"]
-        SampleObject._compatible_classes = ["SampleObject2"]
-
-        obj = SampleObject()
-
-        # mock add obj class to `diffusers`
-        setattr(diffusers, "SampleObject", SampleObject)
-        setattr(diffusers, "SampleObject2", SampleObject2)
-        logger = logging.get_logger("diffusers.configuration_utils")
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            obj.save_config(tmpdirname)
-
-            # now save a config parameter that is expected by another class, but not origin class
-            with open(os.path.join(tmpdirname, SampleObject.config_name), "r") as f:
-                data = json.load(f)
-                data["f"] = [0, 0]
-                data["unexpected"] = True
-
-            with open(os.path.join(tmpdirname, SampleObject.config_name), "w") as f:
-                json.dump(data, f)
-
-            with CaptureLogger(logger) as cap_logger:
-                new_obj = SampleObject.from_config(tmpdirname)
-
-        assert new_obj.__class__ == SampleObject
-
-        assert (
-            cap_logger.out
-            == "The config attributes {'unexpected': True} were passed to SampleObject, but are not expected and will"
-            " be ignored. Please verify your config.json configuration file.\n"
-        )
-
-    def test_save_load_from_different_config_comp_schedulers(self):
-        SampleObject3._compatible_classes = ["SampleObject", "SampleObject2"]
-        SampleObject2._compatible_classes = ["SampleObject", "SampleObject3"]
-        SampleObject._compatible_classes = ["SampleObject2", "SampleObject3"]
-
-        obj = SampleObject()
-
-        # mock add obj class to `diffusers`
-        setattr(diffusers, "SampleObject", SampleObject)
-        setattr(diffusers, "SampleObject2", SampleObject2)
-        setattr(diffusers, "SampleObject3", SampleObject3)
-        logger = logging.get_logger("diffusers.configuration_utils")
-        logger.setLevel(diffusers.logging.INFO)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            obj.save_config(tmpdirname)
-
-            with CaptureLogger(logger) as cap_logger_1:
-                new_obj_1 = SampleObject.from_config(tmpdirname)
-
-            with CaptureLogger(logger) as cap_logger_2:
-                new_obj_2 = SampleObject2.from_config(tmpdirname)
-
-            with CaptureLogger(logger) as cap_logger_3:
-                new_obj_3 = SampleObject3.from_config(tmpdirname)
-
-        assert new_obj_1.__class__ == SampleObject
-        assert new_obj_2.__class__ == SampleObject2
-        assert new_obj_3.__class__ == SampleObject3
-
-        assert cap_logger_1.out == ""
-        assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n"
-        assert cap_logger_3.out == "{'f'} was not found in config. Values will be initialized to default values.\n"
-
     def test_load_ddim_from_pndm(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
         with CaptureLogger(logger) as cap_logger:
-            ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
+            ddim = DDIMScheduler.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+            )
 
         assert ddim.__class__ == DDIMScheduler
         # no warning should be thrown
@@ -262,7 +155,7 @@ def test_load_euler_from_pndm(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
         with CaptureLogger(logger) as cap_logger:
-            euler = EulerDiscreteScheduler.from_config(
+            euler = EulerDiscreteScheduler.from_pretrained(
                 "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
             )
 
@@ -274,7 +167,7 @@ def test_load_euler_ancestral_from_pndm(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
         with CaptureLogger(logger) as cap_logger:
-            euler = EulerAncestralDiscreteScheduler.from_config(
+            euler = EulerAncestralDiscreteScheduler.from_pretrained(
                 "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
             )
 
@@ -286,7 +179,9 @@ def test_load_pndm(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
         with CaptureLogger(logger) as cap_logger:
-            pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
+            pndm = PNDMScheduler.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
+            )
 
         assert pndm.__class__ == PNDMScheduler
         # no warning should be thrown
@@ -296,7 +191,7 @@ def test_overwrite_config_on_load(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
         with CaptureLogger(logger) as cap_logger:
-            ddpm = DDPMScheduler.from_config(
+            ddpm = DDPMScheduler.from_pretrained(
                 "hf-internal-testing/tiny-stable-diffusion-torch",
                 subfolder="scheduler",
                 predict_epsilon=False,
@@ -304,7 +199,7 @@ def test_overwrite_config_on_load(self):
             )
 
         with CaptureLogger(logger) as cap_logger_2:
-            ddpm_2 = DDPMScheduler.from_config("google/ddpm-celebahq-256", beta_start=88)
+            ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88)
 
         assert ddpm.__class__ == DDPMScheduler
         assert ddpm.config.predict_epsilon is False
@@ -319,7 +214,7 @@ def test_load_dpmsolver(self):
         logger = logging.get_logger("diffusers.configuration_utils")
 
         with CaptureLogger(logger) as cap_logger:
-            dpm = DPMSolverMultistepScheduler.from_config(
+            dpm = DPMSolverMultistepScheduler.from_pretrained(
                 "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler"
             )
 
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index eabe6ada9fea..49bb4f6deb33 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -130,7 +130,7 @@ def test_forward_signature(self):
         expected_arg_names = ["sample", "timestep"]
         self.assertListEqual(arg_names[:2], expected_arg_names)
 
-    def test_model_from_config(self):
+    def test_model_from_pretrained(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
 
         model = self.model_class(**init_dict)
@@ -140,8 +140,8 @@ def test_model_from_config(self):
         # test if the model can be loaded from the config
         # and has all the expected shape
         with tempfile.TemporaryDirectory() as tmpdirname:
-            model.save_config(tmpdirname)
-            new_model = self.model_class.from_config(tmpdirname)
+            model.save_pretrained(tmpdirname)
+            new_model = self.model_class.from_pretrained(tmpdirname)
             new_model.to(torch_device)
             new_model.eval()
 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 4559d713ed81..c77b000292a5 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -29,6 +29,10 @@
     DDIMScheduler,
     DDPMPipeline,
     DDPMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
     PNDMScheduler,
     StableDiffusionImg2ImgPipeline,
     StableDiffusionInpaintPipelineLegacy,
@@ -398,6 +402,82 @@ def test_components(self):
         assert image_img2img.shape == (1, 32, 32, 3)
         assert image_text2img.shape == (1, 128, 128, 3)
 
+    def test_set_scheduler(self):
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+        sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, DDIMScheduler)
+        sd.scheduler = DDPMScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, DDPMScheduler)
+        sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, PNDMScheduler)
+        sd.scheduler = LMSDiscreteScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, LMSDiscreteScheduler)
+        sd.scheduler = EulerDiscreteScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, EulerDiscreteScheduler)
+        sd.scheduler = EulerAncestralDiscreteScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, EulerAncestralDiscreteScheduler)
+        sd.scheduler = DPMSolverMultistepScheduler.from_config(sd.scheduler.config)
+        assert isinstance(sd.scheduler, DPMSolverMultistepScheduler)
+
+    def test_set_scheduler_consistency(self):
+        unet = self.dummy_cond_unet
+        pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
+        ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=pndm,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+        pndm_config = sd.scheduler.config
+        sd.scheduler = DDPMScheduler.from_config(pndm_config)
+        sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config)
+        pndm_config_2 = sd.scheduler.config
+        pndm_config_2 = {k: v for k, v in pndm_config_2.items() if k in pndm_config}
+
+        assert dict(pndm_config) == dict(pndm_config_2)
+
+        sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=ddim,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+
+        ddim_config = sd.scheduler.config
+        sd.scheduler = LMSDiscreteScheduler.from_config(ddim_config)
+        sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config)
+        ddim_config_2 = sd.scheduler.config
+        ddim_config_2 = {k: v for k, v in ddim_config_2.items() if k in ddim_config}
+
+        assert dict(ddim_config) == dict(ddim_config_2)
+
 
 @slow
 class PipelineSlowTests(unittest.TestCase):
@@ -519,7 +599,7 @@ def test_from_pretrained_hub_pass_model(self):
     def test_output_format(self):
         model_path = "google/ddpm-cifar10-32"
 
-        scheduler = DDIMScheduler.from_config(model_path)
+        scheduler = DDIMScheduler.from_pretrained(model_path)
         pipe = DDIMPipeline.from_pretrained(model_path, scheduler=scheduler)
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index a9770f0a54a8..9c9abd09732b 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
+import json
+import os
 import tempfile
 import unittest
 from typing import Dict, List, Tuple
@@ -21,6 +23,7 @@
 import torch
 import torch.nn.functional as F
 
+import diffusers
 from diffusers import (
     DDIMScheduler,
     DDPMScheduler,
@@ -32,13 +35,180 @@
     PNDMScheduler,
     ScoreSdeVeScheduler,
     VQDiffusionScheduler,
+    logging,
 )
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
 from diffusers.utils import deprecate, torch_device
+from diffusers.utils.testing_utils import CaptureLogger
 
 
 torch.backends.cuda.matmul.allow_tf32 = False
 
 
+class SchedulerObject(SchedulerMixin, ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        e=[1, 3],
+    ):
+        pass
+
+
+class SchedulerObject2(SchedulerMixin, ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        f=[1, 3],
+    ):
+        pass
+
+
+class SchedulerObject3(SchedulerMixin, ConfigMixin):
+    config_name = "config.json"
+
+    @register_to_config
+    def __init__(
+        self,
+        a=2,
+        b=5,
+        c=(2, 5),
+        d="for diffusion",
+        e=[1, 3],
+        f=[1, 3],
+    ):
+        pass
+
+
+class SchedulerBaseTests(unittest.TestCase):
+    def test_save_load_from_different_config(self):
+        obj = SchedulerObject()
+
+        # mock add obj class to `diffusers`
+        setattr(diffusers, "SchedulerObject", SchedulerObject)
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            obj.save_config(tmpdirname)
+            with CaptureLogger(logger) as cap_logger_1:
+                config = SchedulerObject2.load_config(tmpdirname)
+                new_obj_1 = SchedulerObject2.from_config(config)
+
+            # now save a config parameter that is not expected
+            with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f:
+                data = json.load(f)
+                data["unexpected"] = True
+
+            with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f:
+                json.dump(data, f)
+
+            with CaptureLogger(logger) as cap_logger_2:
+                config = SchedulerObject.load_config(tmpdirname)
+                new_obj_2 = SchedulerObject.from_config(config)
+
+            with CaptureLogger(logger) as cap_logger_3:
+                config = SchedulerObject2.load_config(tmpdirname)
+                new_obj_3 = SchedulerObject2.from_config(config)
+
+        assert new_obj_1.__class__ == SchedulerObject2
+        assert new_obj_2.__class__ == SchedulerObject
+        assert new_obj_3.__class__ == SchedulerObject2
+
+        assert cap_logger_1.out == ""
+        assert (
+            cap_logger_2.out
+            == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and"
+            " will"
+            " be ignored. Please verify your config.json configuration file.\n"
+        )
+        assert cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2") == cap_logger_3.out
+
+    def test_save_load_compatible_schedulers(self):
+        SchedulerObject2._compatibles = ["SchedulerObject"]
+        SchedulerObject._compatibles = ["SchedulerObject2"]
+
+        obj = SchedulerObject()
+
+        # mock add obj class to `diffusers`
+        setattr(diffusers, "SchedulerObject", SchedulerObject)
+        setattr(diffusers, "SchedulerObject2", SchedulerObject2)
+        logger = logging.get_logger("diffusers.configuration_utils")
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            obj.save_config(tmpdirname)
+
+            # now save a config parameter that is expected by another class, but not origin class
+            with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f:
+                data = json.load(f)
+                data["f"] = [0, 0]
+                data["unexpected"] = True
+
+            with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f:
+                json.dump(data, f)
+
+            with CaptureLogger(logger) as cap_logger:
+                config = SchedulerObject.load_config(tmpdirname)
+                new_obj = SchedulerObject.from_config(config)
+
+        assert new_obj.__class__ == SchedulerObject
+
+        assert (
+            cap_logger.out
+            == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and"
+            " will"
+            " be ignored. Please verify your config.json configuration file.\n"
+        )
+
+    def test_save_load_from_different_config_comp_schedulers(self):
+        SchedulerObject3._compatibles = ["SchedulerObject", "SchedulerObject2"]
+        SchedulerObject2._compatibles = ["SchedulerObject", "SchedulerObject3"]
+        SchedulerObject._compatibles = ["SchedulerObject2", "SchedulerObject3"]
+
+        obj = SchedulerObject()
+
+        # mock add obj class to `diffusers`
+        setattr(diffusers, "SchedulerObject", SchedulerObject)
+        setattr(diffusers, "SchedulerObject2", SchedulerObject2)
+        setattr(diffusers, "SchedulerObject3", SchedulerObject3)
+        logger = logging.get_logger("diffusers.configuration_utils")
+        logger.setLevel(diffusers.logging.INFO)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            obj.save_config(tmpdirname)
+
+            with CaptureLogger(logger) as cap_logger_1:
+                config = SchedulerObject.load_config(tmpdirname)
+                new_obj_1 = SchedulerObject.from_config(config)
+
+            with CaptureLogger(logger) as cap_logger_2:
+                config = SchedulerObject2.load_config(tmpdirname)
+                new_obj_2 = SchedulerObject2.from_config(config)
+
+            with CaptureLogger(logger) as cap_logger_3:
+                config = SchedulerObject3.load_config(tmpdirname)
+                new_obj_3 = SchedulerObject3.from_config(config)
+
+        assert new_obj_1.__class__ == SchedulerObject
+        assert new_obj_2.__class__ == SchedulerObject2
+        assert new_obj_3.__class__ == SchedulerObject3
+
+        assert cap_logger_1.out == ""
+        assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n"
+        assert cap_logger_3.out == "{'f'} was not found in config. Values will be initialized to default values.\n"
+
+
 class SchedulerCommonTest(unittest.TestCase):
     scheduler_classes = ()
     forward_default_kwargs = ()
@@ -102,7 +272,7 @@ def check_over_configs(self, time_step=0, **config):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 scheduler.set_timesteps(num_inference_steps)
@@ -145,7 +315,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 scheduler.set_timesteps(num_inference_steps)
@@ -187,7 +357,7 @@ def test_from_pretrained_save_pretrained(self):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 scheduler.set_timesteps(num_inference_steps)
@@ -205,6 +375,42 @@ def test_from_pretrained_save_pretrained(self):
 
             assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
 
+    def test_compatibles(self):
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+
+            scheduler = scheduler_class(**scheduler_config)
+
+            assert all(c is not None for c in scheduler.compatibles)
+
+            for comp_scheduler_cls in scheduler.compatibles:
+                comp_scheduler = comp_scheduler_cls.from_config(scheduler.config)
+                assert comp_scheduler is not None
+
+            new_scheduler = scheduler_class.from_config(comp_scheduler.config)
+
+            new_scheduler_config = {k: v for k, v in new_scheduler.config.items() if k in scheduler.config}
+            scheduler_diff = {k: v for k, v in new_scheduler.config.items() if k not in scheduler.config}
+
+            # make sure that configs are essentially identical
+            assert new_scheduler_config == dict(scheduler.config)
+
+            # make sure that only differences are for configs that are not in init
+            init_keys = inspect.signature(scheduler_class.__init__).parameters.keys()
+            assert set(scheduler_diff.keys()).intersection(set(init_keys)) == set()
+
+    def test_from_pretrained(self):
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config()
+
+            scheduler = scheduler_class(**scheduler_config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                scheduler.save_pretrained(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
+
+            assert scheduler.config == new_scheduler.config
+
     def test_step_shape(self):
         kwargs = dict(self.forward_default_kwargs)
 
@@ -616,7 +822,7 @@ def check_over_configs(self, time_step=0, **config):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
                 new_scheduler.set_timesteps(num_inference_steps)
                 # copy over dummy past residuals
                 new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
@@ -648,7 +854,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
                 # copy over dummy past residuals
                 new_scheduler.set_timesteps(num_inference_steps)
 
@@ -790,7 +996,7 @@ def check_over_configs(self, time_step=0, **config):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
                 new_scheduler.set_timesteps(num_inference_steps)
                 # copy over dummy past residuals
                 new_scheduler.ets = dummy_past_residuals[:]
@@ -825,7 +1031,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
                 # copy over dummy past residuals
                 new_scheduler.set_timesteps(num_inference_steps)
 
@@ -1043,7 +1249,7 @@ def check_over_configs(self, time_step=0, **config):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
 
             output = scheduler.step_pred(
                 residual, time_step, sample, generator=torch.manual_seed(0), **kwargs
@@ -1074,7 +1280,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
 
             output = scheduler.step_pred(
                 residual, time_step, sample, generator=torch.manual_seed(0), **kwargs
@@ -1470,7 +1676,7 @@ def check_over_configs(self, time_step=0, **config):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
                 new_scheduler.set_timesteps(num_inference_steps)
                 # copy over dummy past residuals
                 new_scheduler.ets = dummy_past_residuals[:]
@@ -1508,7 +1714,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_config(tmpdirname)
+                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
                 # copy over dummy past residuals
                 new_scheduler.set_timesteps(num_inference_steps)
 
diff --git a/tests/test_scheduler_flax.py b/tests/test_scheduler_flax.py
index 7928939f2db4..0fa0e1b495bb 100644
--- a/tests/test_scheduler_flax.py
+++ b/tests/test_scheduler_flax.py
@@ -83,7 +83,7 @@ def check_over_configs(self, time_step=0, **config):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 state = scheduler.set_timesteps(state, num_inference_steps)
@@ -112,7 +112,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 state = scheduler.set_timesteps(state, num_inference_steps)
@@ -140,7 +140,7 @@ def test_from_pretrained_save_pretrained(self):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 state = scheduler.set_timesteps(state, num_inference_steps)
@@ -373,7 +373,7 @@ def check_over_configs(self, time_step=0, **config):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 state = scheduler.set_timesteps(state, num_inference_steps)
@@ -401,7 +401,7 @@ def test_from_pretrained_save_pretrained(self):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 state = scheduler.set_timesteps(state, num_inference_steps)
@@ -430,7 +430,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
 
             if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
                 state = scheduler.set_timesteps(state, num_inference_steps)
@@ -633,7 +633,7 @@ def check_over_configs(self, time_step=0, **config):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
                 new_state = new_scheduler.set_timesteps(new_state, num_inference_steps, shape=sample.shape)
                 # copy over dummy past residuals
                 new_state = new_state.replace(ets=dummy_past_residuals[:])
@@ -720,7 +720,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs):
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 scheduler.save_config(tmpdirname)
-                new_scheduler, new_state = scheduler_class.from_config(tmpdirname)
+                new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname)
                 # copy over dummy past residuals
                 new_state = new_scheduler.set_timesteps(new_state, num_inference_steps, shape=sample.shape)
 

From 4625f04bc04bed43c1ba9b821149af121b5965ea Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 15 Nov 2022 17:34:00 +0000
Subject: [PATCH 191/269] remove bogus files

---
 ...onvert_versatile_diffusion_to_diffusers.py | 748 ------------------
 v1-inference.yaml                             |  70 --
 2 files changed, 818 deletions(-)
 delete mode 100644 scripts/convert_versatile_diffusion_to_diffusers.py
 delete mode 100644 v1-inference.yaml

diff --git a/scripts/convert_versatile_diffusion_to_diffusers.py b/scripts/convert_versatile_diffusion_to_diffusers.py
deleted file mode 100644
index 20ac78f944ea..000000000000
--- a/scripts/convert_versatile_diffusion_to_diffusers.py
+++ /dev/null
@@ -1,748 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Conversion script for the Versatile Stable Diffusion checkpoints. """
-
-import argparse
-import os
-
-import torch
-
-
-try:
-    from omegaconf import OmegaConf
-except ImportError:
-    raise ImportError(
-        "OmegaConf is required to convert the LDM checkpoints. Please install it with `pip install OmegaConf`."
-    )
-
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    EulerAncestralDiscreteScheduler,
-    EulerDiscreteScheduler,
-    LDMTextToImagePipeline,
-    LMSDiscreteScheduler,
-    PNDMScheduler,
-    StableDiffusionPipeline,
-    UNet2DConditionModel,
-)
-from diffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
-from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from transformers import AutoFeatureExtractor, BertTokenizerFast, CLIPTextModel, CLIPTokenizer
-
-
-def shave_segments(path, n_shave_prefix_segments=1):
-    """
-    Removes segments. Positive values shave the first segments, negative shave the last segments.
-    """
-    if n_shave_prefix_segments >= 0:
-        return ".".join(path.split(".")[n_shave_prefix_segments:])
-    else:
-        return ".".join(path.split(".")[:n_shave_prefix_segments])
-
-
-def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside resnets to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item.replace("in_layers.0", "norm1")
-        new_item = new_item.replace("in_layers.2", "conv1")
-
-        new_item = new_item.replace("out_layers.0", "norm2")
-        new_item = new_item.replace("out_layers.3", "conv2")
-
-        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
-        new_item = new_item.replace("skip_connection", "conv_shortcut")
-
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside resnets to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-def renew_attention_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside attentions to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
-        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
-
-        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
-        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
-
-        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside attentions to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        new_item = new_item.replace("norm.weight", "group_norm.weight")
-        new_item = new_item.replace("norm.bias", "group_norm.bias")
-
-        new_item = new_item.replace("q.weight", "query.weight")
-        new_item = new_item.replace("q.bias", "query.bias")
-
-        new_item = new_item.replace("k.weight", "key.weight")
-        new_item = new_item.replace("k.bias", "key.bias")
-
-        new_item = new_item.replace("v.weight", "value.weight")
-        new_item = new_item.replace("v.bias", "value.bias")
-
-        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
-        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
-
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-def assign_to_checkpoint(
-    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
-):
-    """
-    This does the final conversion step: take locally converted weights and apply a global renaming
-    to them. It splits attention layers, and takes into account additional replacements
-    that may arise.
-
-    Assigns the weights to the new checkpoint.
-    """
-    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
-
-    # Splits the attention layers into three variables.
-    if attention_paths_to_split is not None:
-        for path, path_map in attention_paths_to_split.items():
-            old_tensor = old_checkpoint[path]
-            channels = old_tensor.shape[0] // 3
-
-            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
-
-            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
-
-            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
-            query, key, value = old_tensor.split(channels // num_heads, dim=1)
-
-            checkpoint[path_map["query"]] = query.reshape(target_shape)
-            checkpoint[path_map["key"]] = key.reshape(target_shape)
-            checkpoint[path_map["value"]] = value.reshape(target_shape)
-
-    for path in paths:
-        new_path = path["new"]
-
-        # These have already been assigned
-        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
-            continue
-
-        # Global renaming happens here
-        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
-        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
-        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
-
-        if additional_replacements is not None:
-            for replacement in additional_replacements:
-                new_path = new_path.replace(replacement["old"], replacement["new"])
-
-        # proj_attn.weight has to be converted from conv 1D to linear
-        if "proj_attn.weight" in new_path:
-            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
-        else:
-            checkpoint[new_path] = old_checkpoint[path["old"]]
-
-
-def conv_attn_to_linear(checkpoint):
-    keys = list(checkpoint.keys())
-    attn_keys = ["query.weight", "key.weight", "value.weight"]
-    for key in keys:
-        if ".".join(key.split(".")[-2:]) in attn_keys:
-            if checkpoint[key].ndim > 2:
-                checkpoint[key] = checkpoint[key][:, :, 0, 0]
-        elif "proj_attn.weight" in key:
-            if checkpoint[key].ndim > 2:
-                checkpoint[key] = checkpoint[key][:, :, 0]
-
-
-def create_unet_diffusers_config(original_config):
-    """
-    Creates a config for the diffusers based on the config of the LDM model.
-    """
-    unet_params = original_config.model.params.unet_config.params
-
-    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
-
-    down_block_types = []
-    resolution = 1
-    for i in range(len(block_out_channels)):
-        block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
-        down_block_types.append(block_type)
-        if i != len(block_out_channels) - 1:
-            resolution *= 2
-
-    up_block_types = []
-    for i in range(len(block_out_channels)):
-        block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
-        up_block_types.append(block_type)
-        resolution //= 2
-
-    config = dict(
-        sample_size=unet_params.image_size,
-        in_channels=unet_params.in_channels,
-        out_channels=unet_params.out_channels,
-        down_block_types=tuple(down_block_types),
-        up_block_types=tuple(up_block_types),
-        block_out_channels=tuple(block_out_channels),
-        layers_per_block=unet_params.num_res_blocks,
-        cross_attention_dim=unet_params.context_dim,
-        attention_head_dim=unet_params.num_heads,
-    )
-
-    return config
-
-
-def create_vae_diffusers_config(original_config):
-    """
-    Creates a config for the diffusers based on the config of the LDM model.
-    """
-    vae_params = original_config.model.params.first_stage_config.params.ddconfig
-    _ = original_config.model.params.first_stage_config.params.embed_dim
-
-    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
-    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
-    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
-
-    config = dict(
-        sample_size=vae_params.resolution,
-        in_channels=vae_params.in_channels,
-        out_channels=vae_params.out_ch,
-        down_block_types=tuple(down_block_types),
-        up_block_types=tuple(up_block_types),
-        block_out_channels=tuple(block_out_channels),
-        latent_channels=vae_params.z_channels,
-        layers_per_block=vae_params.num_res_blocks,
-    )
-    return config
-
-
-def create_diffusers_schedular(original_config):
-    schedular = DDIMScheduler(
-        num_train_timesteps=original_config.model.params.timesteps,
-        beta_start=original_config.model.params.linear_start,
-        beta_end=original_config.model.params.linear_end,
-        beta_schedule="scaled_linear",
-    )
-    return schedular
-
-
-def create_ldm_bert_config(original_config):
-    bert_params = original_config.model.parms.cond_stage_config.params
-    config = LDMBertConfig(
-        d_model=bert_params.n_embed,
-        encoder_layers=bert_params.n_layer,
-        encoder_ffn_dim=bert_params.n_embed * 4,
-    )
-    return config
-
-
-def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False):
-    """
-    Takes a state dict and a config, and returns a converted checkpoint.
-    """
-
-    # extract state_dict for UNet
-    unet_state_dict = {}
-    keys = list(checkpoint.keys())
-
-    unet_key = "model.diffusion_model."
-    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
-    if sum(k.startswith("model_ema") for k in keys) > 100:
-        print(f"Checkpoint {path} has both EMA and non-EMA weights.")
-        if extract_ema:
-            print(
-                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
-                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
-            )
-            for key in keys:
-                if key.startswith("model.diffusion_model"):
-                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
-                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
-        else:
-            print(
-                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
-                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
-            )
-
-    for key in keys:
-        if key.startswith(unet_key):
-            unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
-
-    new_checkpoint = {}
-
-    new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
-    new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
-    new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
-    new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
-
-    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
-    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
-
-    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
-    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
-    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
-    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
-
-    # Retrieves the keys for the input blocks only
-    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
-    input_blocks = {
-        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
-        for layer_id in range(num_input_blocks)
-    }
-
-    # Retrieves the keys for the middle blocks only
-    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
-    middle_blocks = {
-        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
-        for layer_id in range(num_middle_blocks)
-    }
-
-    # Retrieves the keys for the output blocks only
-    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
-    output_blocks = {
-        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
-        for layer_id in range(num_output_blocks)
-    }
-
-    for i in range(1, num_input_blocks):
-        block_id = (i - 1) // (config["layers_per_block"] + 1)
-        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
-
-        resnets = [
-            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
-        ]
-        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
-
-        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
-            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
-                f"input_blocks.{i}.0.op.weight"
-            )
-            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
-                f"input_blocks.{i}.0.op.bias"
-            )
-
-        paths = renew_resnet_paths(resnets)
-        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
-        assign_to_checkpoint(
-            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-        )
-
-        if len(attentions):
-            paths = renew_attention_paths(attentions)
-            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
-            assign_to_checkpoint(
-                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-            )
-
-    resnet_0 = middle_blocks[0]
-    attentions = middle_blocks[1]
-    resnet_1 = middle_blocks[2]
-
-    resnet_0_paths = renew_resnet_paths(resnet_0)
-    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
-
-    resnet_1_paths = renew_resnet_paths(resnet_1)
-    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
-
-    attentions_paths = renew_attention_paths(attentions)
-    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
-    assign_to_checkpoint(
-        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-    )
-
-    for i in range(num_output_blocks):
-        block_id = i // (config["layers_per_block"] + 1)
-        layer_in_block_id = i % (config["layers_per_block"] + 1)
-        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
-        output_block_list = {}
-
-        for layer in output_block_layers:
-            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
-            if layer_id in output_block_list:
-                output_block_list[layer_id].append(layer_name)
-            else:
-                output_block_list[layer_id] = [layer_name]
-
-        if len(output_block_list) > 1:
-            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
-            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
-
-            resnet_0_paths = renew_resnet_paths(resnets)
-            paths = renew_resnet_paths(resnets)
-
-            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
-            assign_to_checkpoint(
-                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-            )
-
-            if ["conv.weight", "conv.bias"] in output_block_list.values():
-                index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
-                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
-                    f"output_blocks.{i}.{index}.conv.weight"
-                ]
-                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
-                    f"output_blocks.{i}.{index}.conv.bias"
-                ]
-
-                # Clear attentions as they have been attributed above.
-                if len(attentions) == 2:
-                    attentions = []
-
-            if len(attentions):
-                paths = renew_attention_paths(attentions)
-                meta_path = {
-                    "old": f"output_blocks.{i}.1",
-                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
-                }
-                assign_to_checkpoint(
-                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-                )
-        else:
-            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
-            for path in resnet_0_paths:
-                old_path = ".".join(["output_blocks", str(i), path["old"]])
-                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
-
-                new_checkpoint[new_path] = unet_state_dict[old_path]
-
-    return new_checkpoint
-
-
-def convert_ldm_vae_checkpoint(checkpoint, config):
-    # extract state dict for VAE
-    vae_state_dict = {}
-    keys = list(checkpoint.keys())
-    for key in keys:
-        vae_state_dict[key] = checkpoint.get(key)
-
-    new_checkpoint = {}
-
-    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
-    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
-    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
-    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
-    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
-    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
-
-    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
-    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
-    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
-    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
-    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
-    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
-
-    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
-    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
-    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
-    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
-
-    # Retrieves the keys for the encoder down blocks only
-    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
-    down_blocks = {
-        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
-    }
-
-    # Retrieves the keys for the decoder up blocks only
-    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
-    up_blocks = {
-        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
-    }
-
-    for i in range(num_down_blocks):
-        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
-
-        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
-            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
-                f"encoder.down.{i}.downsample.conv.weight"
-            )
-            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
-                f"encoder.down.{i}.downsample.conv.bias"
-            )
-
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
-    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
-    num_mid_res_blocks = 2
-    for i in range(1, num_mid_res_blocks + 1):
-        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
-
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
-    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
-    paths = renew_vae_attention_paths(mid_attentions)
-    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
-    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-    conv_attn_to_linear(new_checkpoint)
-
-    for i in range(num_up_blocks):
-        block_id = num_up_blocks - 1 - i
-        resnets = [
-            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
-        ]
-
-        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
-            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
-                f"decoder.up.{block_id}.upsample.conv.weight"
-            ]
-            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
-                f"decoder.up.{block_id}.upsample.conv.bias"
-            ]
-
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
-    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
-    num_mid_res_blocks = 2
-    for i in range(1, num_mid_res_blocks + 1):
-        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
-
-        paths = renew_vae_resnet_paths(resnets)
-        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
-    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
-    paths = renew_vae_attention_paths(mid_attentions)
-    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
-    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-    conv_attn_to_linear(new_checkpoint)
-    return new_checkpoint
-
-
-def convert_ldm_bert_checkpoint(checkpoint, config):
-    def _copy_attn_layer(hf_attn_layer, pt_attn_layer):
-        hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight
-        hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight
-        hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight
-
-        hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight
-        hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias
-
-    def _copy_linear(hf_linear, pt_linear):
-        hf_linear.weight = pt_linear.weight
-        hf_linear.bias = pt_linear.bias
-
-    def _copy_layer(hf_layer, pt_layer):
-        # copy layer norms
-        _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0])
-        _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0])
-
-        # copy attn
-        _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1])
-
-        # copy MLP
-        pt_mlp = pt_layer[1][1]
-        _copy_linear(hf_layer.fc1, pt_mlp.net[0][0])
-        _copy_linear(hf_layer.fc2, pt_mlp.net[2])
-
-    def _copy_layers(hf_layers, pt_layers):
-        for i, hf_layer in enumerate(hf_layers):
-            if i != 0:
-                i += i
-            pt_layer = pt_layers[i : i + 2]
-            _copy_layer(hf_layer, pt_layer)
-
-    hf_model = LDMBertModel(config).eval()
-
-    # copy  embeds
-    hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight
-    hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight
-
-    # copy layer norm
-    _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm)
-
-    # copy hidden layers
-    _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers)
-
-    _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits)
-
-    return hf_model
-
-
-def convert_ldm_clip_checkpoint(checkpoint):
-    text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
-
-    keys = list(checkpoint.keys())
-
-    text_model_dict = {}
-
-    for key in keys:
-        if key.startswith("cond_stage_model.transformer"):
-            text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
-
-    text_model.load_state_dict(text_model_dict)
-
-    return text_model
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--unet_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
-    )
-    parser.add_argument(
-        "--vae_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
-    )
-    parser.add_argument(
-        "--optimus_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
-    )
-    # !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml
-    parser.add_argument(
-        "--original_config_file",
-        default=None,
-        type=str,
-        help="The YAML config file corresponding to the original architecture.",
-    )
-    parser.add_argument(
-        "--scheduler_type",
-        default="pndm",
-        type=str,
-        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancest', 'dpm']",
-    )
-    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
-
-    args = parser.parse_args()
-
-    if args.original_config_file is None:
-        os.system(
-            "wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
-        )
-        args.original_config_file = "./v1-inference.yaml"
-
-    original_config = OmegaConf.load(args.original_config_file)
-
-
-    num_train_timesteps = original_config.model.params.timesteps
-    beta_start = original_config.model.params.linear_start
-    beta_end = original_config.model.params.linear_end
-    if args.scheduler_type == "pndm":
-        scheduler = PNDMScheduler(
-            beta_end=beta_end,
-            beta_schedule="scaled_linear",
-            beta_start=beta_start,
-            num_train_timesteps=num_train_timesteps,
-            skip_prk_steps=True,
-        )
-    elif args.scheduler_type == "lms":
-        scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
-    elif args.scheduler_type == "euler":
-        scheduler = EulerDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
-    elif args.scheduler_type == "euler-ancestral":
-        scheduler = EulerAncestralDiscreteScheduler(
-            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
-        )
-    elif args.scheduler_type == "dpm":
-        scheduler = DPMSolverMultistepScheduler(
-            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
-        )
-    elif args.scheduler_type == "ddim":
-        scheduler = DDIMScheduler(
-            beta_start=beta_start,
-            beta_end=beta_end,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-    else:
-        raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
-
-    # Convert the UNet2DConditionModel model.
-#    checkpoint = torch.load(args.unet_checkpoint_path)
-#    unet_config = create_unet_diffusers_config(original_config)
-#    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
-#        checkpoint, unet_config, path=args.checkpoint_path, extract_ema=args.extract_ema
-#    )
-#
-#    unet = UNet2DConditionModel(**unet_config)
-#    unet.load_state_dict(converted_unet_checkpoint)
-
-    # Convert the VAE model.
-    if args.vae_checkpoint_path is not None:
-        vae_config = create_vae_diffusers_config(original_config)
-        checkpoint = torch.load(args.vae_checkpoint_path)
-        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-
-        vae = AutoencoderKL(**vae_config)
-        vae.load_state_dict(converted_vae_checkpoint)
-        vae.save_pretrained(os.path.join(args.dump_path, "vae"))
-
-    # Convert the text model.
-#    text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1]
-#    if text_model_type == "FrozenCLIPEmbedder":
-#        text_model = convert_ldm_clip_checkpoint(checkpoint)
-#        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-#        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
-#        feature_extractor = AutoFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker")
-#        pipe = StableDiffusionPipeline(
-#            vae=vae,
-#            text_encoder=text_model,
-#            tokenizer=tokenizer,
-#            unet=unet,
-#            scheduler=scheduler,
-#            safety_checker=safety_checker,
-#            feature_extractor=feature_extractor,
-#        )
-#    else:
-#        text_config = create_ldm_bert_config(original_config)
-#        text_model = convert_ldm_bert_checkpoint(checkpoint, text_config)
-#        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
-#        pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
-#
-#    pipe.save_pretrained(args.dump_path)
diff --git a/v1-inference.yaml b/v1-inference.yaml
deleted file mode 100644
index d4effe569e89..000000000000
--- a/v1-inference.yaml
+++ /dev/null
@@ -1,70 +0,0 @@
-model:
-  base_learning_rate: 1.0e-04
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false   # Note: different from the one we trained before
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-
-    scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps: [ 10000 ]
-        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
-        f_start: [ 1.e-6 ]
-        f_max: [ 1. ]
-        f_min: [ 1. ]
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: True
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

From 8a7306457678dad1246ff767553c6200802828d4 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Tue, 15 Nov 2022 21:32:26 +0100
Subject: [PATCH 192/269] Add AltDiffusion (#1299)

* add conversion script for vae

* up

* up

* some fixes

* add text model

* use the correct config

* add docs

* move model in it's own file

* move model in its own file

* pass attenion mask to text encoder

* pass attn mask to uncond inputs

* quality

* fix image2image

* add imag2image in init

* fix import

* fix one more import

* fix import, dummy objetcs

* fix copied from

* up

* finish

Co-authored-by: patil-suraj <surajp815@gmail.com>
---
 docs/source/_toctree.yml                      |   2 +
 docs/source/api/pipelines/alt_diffusion.mdx   |  83 +++
 docs/source/api/pipelines/overview.mdx        |   1 +
 docs/source/index.mdx                         |   1 +
 .../conditional_image_generation.mdx          |   2 -
 src/diffusers/__init__.py                     |   2 +
 src/diffusers/pipelines/__init__.py           |   1 +
 .../pipelines/alt_diffusion/__init__.py       |  34 +
 .../alt_diffusion/modeling_roberta_series.py  | 110 ++++
 .../alt_diffusion/pipeline_alt_diffusion.py   | 533 ++++++++++++++++
 .../pipeline_alt_diffusion_img2img.py         | 579 ++++++++++++++++++
 .../pipeline_cycle_diffusion.py               |  24 +-
 .../pipeline_stable_diffusion.py              |  24 +-
 .../pipeline_stable_diffusion_img2img.py      |  24 +-
 .../pipeline_stable_diffusion_inpaint.py      |  24 +-
 ...ipeline_stable_diffusion_inpaint_legacy.py |  24 +-
 .../dummy_torch_and_transformers_objects.py   |  30 +
 17 files changed, 1486 insertions(+), 12 deletions(-)
 create mode 100644 docs/source/api/pipelines/alt_diffusion.mdx
 create mode 100644 src/diffusers/pipelines/alt_diffusion/__init__.py
 create mode 100644 src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py
 create mode 100644 src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
 create mode 100644 src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 0e8dec816774..4491a1eab657 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -80,6 +80,8 @@
   - sections:
     - local: api/pipelines/overview
       title: "Overview"
+    - local: api/pipelines/alt_diffusion
+      title: "AltDiffusion"
     - local: api/pipelines/cycle_diffusion
       title: "Cycle Diffusion"
     - local: api/pipelines/ddim
diff --git a/docs/source/api/pipelines/alt_diffusion.mdx b/docs/source/api/pipelines/alt_diffusion.mdx
new file mode 100644
index 000000000000..efa9beb8c072
--- /dev/null
+++ b/docs/source/api/pipelines/alt_diffusion.mdx
@@ -0,0 +1,83 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# AltDiffusion
+
+AltDiffusion was proposed in [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, Ledell Wu
+
+The abstract of the paper is the following:
+
+*In this work, we present a conceptually simple and effective method to train a strong bilingual multimodal representation model. Starting from the pretrained multimodal representation model CLIP released by OpenAI, we switched its text encoder with a pretrained multilingual text encoder XLM-R, and aligned both languages and image representations by a two-stage training schema consisting of teacher learning and contrastive learning. We validate our method through evaluations of a wide range of tasks. We set new state-of-the-art performances on a bunch of tasks including ImageNet-CN, Flicker30k- CN, and COCO-CN. Further, we obtain very close performances with CLIP on almost all tasks, suggesting that one can simply alter the text encoder in CLIP for extended capabilities such as multilingual understanding.*
+
+
+*Overview*:
+
+| Pipeline | Tasks | Colab | Demo
+|---|---|:---:|:---:|
+| [pipeline_alt_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py) | *Text-to-Image Generation* | - | -
+| [pipeline_alt_diffusion_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py) | *Image-to-Image Text-Guided Generation* | - |-
+
+## Tips
+
+- AltDiffusion is conceptually exaclty the same as [Stable Diffusion](./api/pipelines/stable_diffusion).
+
+- *Run AltDiffusion*
+
+AltDiffusion can be tested very easily with the [`AltDiffusionPipeline`], [`AltDiffusionImg2ImgPipeline`] and the `"BAAI/AltDiffusion"` checkpoint exactly in the same way it is shown in the [Conditional Image Generation Guide](./using-diffusers/conditional_image_generation) and the [Image-to-Image Generation Guide](./using-diffusers/img2img).
+
+- *How to load and use different schedulers.*
+
+The alt diffusion pipeline uses [`DDIMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the alt diffusion pipeline such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
+To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] method or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
+
+```python
+>>> from diffusers import AltDiffusionPipeline, EulerDiscreteScheduler
+
+>>> pipeline = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion")
+>>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+>>> # or
+>>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler")
+>>> pipeline = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=euler_scheduler)
+```
+
+
+- *How to conver all use cases with multiple or single pipeline*
+
+If you want to use all possible use cases in a single `DiffusionPipeline` we recommend using the `components` functionality to instantiate all components in the most memory-efficient way:
+
+```python
+>>> from diffusers import (
+...     AltDiffusionPipeline,
+...     AltDiffusionImg2ImgPipeline,
+... )
+
+>>> img2text = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion")
+>>> img2img = AltDiffusionImg2ImgPipeline(**img2text.components)
+
+>>> # now you can use img2text(...) and img2img(...) just like the call methods of each respective pipeline
+```
+
+## AltDiffusionPipelineOutput
+[[autodoc]] pipelines.alt_diffusion.AltDiffusionPipelineOutput
+
+## AltDiffusionPipeline
+[[autodoc]] AltDiffusionPipeline
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+
+## AltDiffusionImg2ImgPipeline
+[[autodoc]] AltDiffusionImg2ImgPipeline
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index d68961a2fc06..d504ecbe47a8 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -44,6 +44,7 @@ available a colab notebook to directly try them out.
 
 | Pipeline | Paper | Tasks | Colab
 |---|---|:---:|:---:|
+| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | -
 | [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
 | [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
 | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index bae507ac11d3..6e145490642c 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -34,6 +34,7 @@ available a colab notebook to directly try them out.
 
 | Pipeline | Paper | Tasks | Colab
 |---|---|:---:|:---:|
+| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation |
 | [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation |
 | [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation |
 | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
diff --git a/docs/source/using-diffusers/conditional_image_generation.mdx b/docs/source/using-diffusers/conditional_image_generation.mdx
index 6273a71d4c0a..5ed27ac9171c 100644
--- a/docs/source/using-diffusers/conditional_image_generation.mdx
+++ b/docs/source/using-diffusers/conditional_image_generation.mdx
@@ -44,5 +44,3 @@ You can save the image by simply calling:
 ```python
 >>> image.save("image_of_squirrel_painting.png")
 ```
-
-
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 86eda7371fe9..42cb2cb585c8 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -65,6 +65,8 @@
 
 if is_torch_available() and is_transformers_available():
     from .pipelines import (
+        AltDiffusionImg2ImgPipeline,
+        AltDiffusionPipeline,
         CycleDiffusionPipeline,
         LDMTextToImagePipeline,
         StableDiffusionImg2ImgPipeline,
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index ef4d23e5e6d0..3ca66b28b5f1 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -15,6 +15,7 @@
     from ..utils.dummy_pt_objects import *  # noqa F403
 
 if is_torch_available() and is_transformers_available():
+    from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline
     from .latent_diffusion import LDMTextToImagePipeline
     from .stable_diffusion import (
         CycleDiffusionPipeline,
diff --git a/src/diffusers/pipelines/alt_diffusion/__init__.py b/src/diffusers/pipelines/alt_diffusion/__init__.py
new file mode 100644
index 000000000000..09d0d9b7852c
--- /dev/null
+++ b/src/diffusers/pipelines/alt_diffusion/__init__.py
@@ -0,0 +1,34 @@
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+
+import PIL
+from PIL import Image
+
+from ...utils import BaseOutput, is_torch_available, is_transformers_available
+
+
+@dataclass
+# Copied from diffusers.pipelines.stable_diffusion.__init__.StableDiffusionPipelineOutput with Stable->Alt
+class AltDiffusionPipelineOutput(BaseOutput):
+    """
+    Output class for Alt Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+        nsfw_content_detected (`List[bool]`)
+            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, or `None` if safety checking could not be performed.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+
+
+if is_transformers_available() and is_torch_available():
+    from .modeling_roberta_series import RobertaSeriesModelWithTransformation
+    from .pipeline_alt_diffusion import AltDiffusionPipeline
+    from .pipeline_alt_diffusion_img2img import AltDiffusionImg2ImgPipeline
diff --git a/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py b/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py
new file mode 100644
index 000000000000..2e92314162d3
--- /dev/null
+++ b/src/diffusers/pipelines/alt_diffusion/modeling_roberta_series.py
@@ -0,0 +1,110 @@
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+
+from transformers import RobertaPreTrainedModel, XLMRobertaConfig, XLMRobertaModel
+from transformers.utils import ModelOutput
+
+
+@dataclass
+class TransformationModelOutput(ModelOutput):
+    """
+    Base class for text model's outputs that also contains a pooling of the last hidden states.
+
+    Args:
+        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+            The text embeddings obtained by applying the projection layer to the pooler_output.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    projection_state: Optional[torch.FloatTensor] = None
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class RobertaSeriesConfig(XLMRobertaConfig):
+    def __init__(
+        self,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        project_dim=512,
+        pooler_fn="cls",
+        learn_encoder=False,
+        use_attention_mask=True,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.project_dim = project_dim
+        self.pooler_fn = pooler_fn
+        self.learn_encoder = learn_encoder
+        self.use_attention_mask = use_attention_mask
+
+
+class RobertaSeriesModelWithTransformation(RobertaPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+    base_model_prefix = "roberta"
+    config_class = RobertaSeriesConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.roberta = XLMRobertaModel(config)
+        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ):
+        r""" """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.base_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        projection_state = self.transformation(outputs.last_hidden_state)
+
+        return TransformationModelOutput(
+            projection_state=projection_state,
+            last_hidden_state=outputs.last_hidden_state,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
new file mode 100644
index 000000000000..01b2051db410
--- /dev/null
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -0,0 +1,533 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+
+from diffusers.utils import is_accelerate_available
+from transformers import CLIPFeatureExtractor, XLMRobertaTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ...utils import deprecate, logging
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker
+class AltDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Alt Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`RobertaSeriesModelWithTransformation`]):
+            Frozen text-encoder. Alt Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.RobertaSeriesModelWithTransformation),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`XLMRobertaTokenizer`):
+            Tokenizer of class
+            [XLMRobertaTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.XLMRobertaTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: RobertaSeriesModelWithTransformation,
+        tokenizer: XLMRobertaTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device("cuda")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        # 8. Post-processing
+        image = self.decode_latents(latents)
+
+        # 9. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
+
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
new file mode 100644
index 000000000000..5b675c121b48
--- /dev/null
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -0,0 +1,579 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from diffusers.utils import is_accelerate_available
+from transformers import CLIPFeatureExtractor, XLMRobertaTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ...utils import deprecate, logging
+from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2.0 * image - 1.0
+
+
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker
+class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image to image generation using Alt Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`RobertaSeriesModelWithTransformation`]):
+            Frozen text-encoder. Alt Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.RobertaSeriesModelWithTransformation),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`XLMRobertaTokenizer`):
+            Tokenizer of class
+            [XLMRobertaTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.XLMRobertaTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.__init__
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: RobertaSeriesModelWithTransformation,
+        tokenizer: XLMRobertaTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.enable_attention_slicing
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.disable_attention_slicing
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.enable_sequential_cpu_offload
+    def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device("cuda")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.enable_xformers_memory_efficient_attention
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.disable_xformers_memory_efficient_attention
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline._encode_prompt
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, prompt, strength, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        timesteps = self.scheduler.timesteps[t_start:]
+
+        return timesteps
+
+    def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
+        init_image = init_image.to(device=device, dtype=dtype)
+        init_latent_dist = self.vae.encode(init_image).latent_dist
+        init_latents = init_latent_dist.sample(generator=generator)
+        init_latents = 0.18215 * init_latents
+
+        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
+            # expand init_latents for batch_size
+            deprecation_message = (
+                f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
+                " images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
+                " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
+                " your script to pass as many init images as text prompts to suppress this warning."
+            )
+            deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
+            additional_image_per_prompt = batch_size // init_latents.shape[0]
+            init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
+        elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+            raise ValueError(
+                f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+            )
+        else:
+            init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
+
+        # add noise to latents using the timesteps
+        noise = torch.randn(init_latents.shape, generator=generator, device=device, dtype=dtype)
+
+        # get latents
+        init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+        latents = init_latents
+
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[torch.Generator] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 1. Check inputs
+        self.check_inputs(prompt, strength, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Preprocess image
+        if isinstance(init_image, PIL.Image.Image):
+            init_image = preprocess(init_image)
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.get_timesteps(num_inference_steps, strength, device)
+        latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+
+        # 6. Prepare latent variables
+        latents = self.prepare_latents(
+            init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
+        )
+
+        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 8. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        # 9. Post-processing
+        image = self.decode_latents(latents)
+
+        # 10. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
+
+        # 11. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index dfdb58de4d0f..ec14914eb4c2 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -301,7 +301,17 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed, seq_len, _ = text_embeddings.shape
@@ -337,7 +347,17 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index e6353472933c..65922451f0d4 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -248,7 +248,17 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed, seq_len, _ = text_embeddings.shape
@@ -284,7 +294,17 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 9df800dc2d61..d8d3e7098663 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -268,7 +268,17 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed, seq_len, _ = text_embeddings.shape
@@ -304,7 +314,17 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index a122723eee6d..fea2b3e5a8ee 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -261,7 +261,17 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed, seq_len, _ = text_embeddings.shape
@@ -297,7 +307,17 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 86d879eaa8c3..72def76f2887 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -281,7 +281,17 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
                 "The following part of your input was truncated because CLIP can only handle sequences up to"
                 f" {self.tokenizer.model_max_length} tokens: {removed_text}"
             )
-        text_embeddings = self.text_encoder(text_input_ids.to(device))[0]
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
 
         # duplicate text embeddings for each generation per prompt, using mps friendly method
         bs_embed, seq_len, _ = text_embeddings.shape
@@ -317,7 +327,17 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
                 truncation=True,
                 return_tensors="pt",
             )
-            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(device))[0]
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
 
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
             seq_len = uncond_embeddings.shape[1]
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 63e8a60f74c2..92c163ba74ea 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -4,6 +4,36 @@
 from ..utils import DummyObject, requires_backends
 
 
+class AltDiffusionImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AltDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class CycleDiffusionPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 

From af9ee8736c31cc37e3be639bfa26cf7d42ab4667 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 16 Nov 2022 10:28:19 +0100
Subject: [PATCH 193/269] Better error message for transformers dummy (#1306)

---
 src/diffusers/pipeline_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index b4b1b9dd0c55..4ab1695683a7 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -57,6 +57,7 @@
 INDEX_FILE = "diffusion_pytorch_model.bin"
 CUSTOM_PIPELINE_FILE_NAME = "pipeline.py"
 DUMMY_MODULES_FOLDER = "diffusers.utils"
+TRANSFORMERS_DUMMY_MODULES_FOLDER = "transformers.utils"
 
 
 logger = logging.get_logger(__name__)
@@ -592,7 +593,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
                 if load_method_name is None:
                     none_module = class_obj.__module__
-                    if none_module.startswith(DUMMY_MODULES_FOLDER) and "dummy" in none_module:
+                    is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith(
+                        TRANSFORMERS_DUMMY_MODULES_FOLDER
+                    )
+                    if is_dummy_path and "dummy" in none_module:
                         # call class_obj for nice error message of missing requirements
                         class_obj()
 

From 327ddc877003dc7b0c989fda80515f15c09053ab Mon Sep 17 00:00:00 2001
From: Mishig <mishig.davaadorj@coloradocollege.edu>
Date: Wed, 16 Nov 2022 11:46:13 +0100
Subject: [PATCH 194/269] Revert "Update pr docs actions" (#1307)

Revert "Update pr docs actions (#1194)"

This reverts commit 32b0736d8ad7ec124affca3a00a266f5addcbd91.
---
 .github/workflows/build_pr_documentation.yml | 5 +----
 .github/workflows/delete_doc_comment.yml     | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index 542920d7f6ed..d51623e735c5 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -9,11 +9,8 @@ concurrency:
 
 jobs:
   build:
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@use_hf_hub
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: diffusers
-    secrets:
-      token: ${{ secrets.HF_DOC_PUSH }}
-      comment_bot_token: ${{ secrets.HUGGINGFACE_PUSH }}
diff --git a/.github/workflows/delete_doc_comment.yml b/.github/workflows/delete_doc_comment.yml
index e1b2da9567ed..238dc0bdbabf 100644
--- a/.github/workflows/delete_doc_comment.yml
+++ b/.github/workflows/delete_doc_comment.yml
@@ -7,10 +7,7 @@ on:
 
 jobs:
   delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@use_hf_hub
+    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
     with:
       pr_number: ${{ github.event.number }}
       package: diffusers
-    secrets:
-      token: ${{ secrets.HF_DOC_PUSH }}
-      comment_bot_token: ${{ secrets.HUGGINGFACE_PUSH }}

From 46893adacd4d821ff5c64d60b1cad6d163040b27 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Wed, 16 Nov 2022 15:40:26 +0100
Subject: [PATCH 195/269] [AltDiffusion] add tests  (#1311)

* being tests

* fix model ids

* don't use safety checker in tests

* add im2img2 tests

* fix integration tests

* integration tests

* style

* add sentencepiece in test dep

* quality

* 4 decimalk points

* fix im2img test

* increase the tok slightly
---
 setup.py                                      |   2 +
 src/diffusers/dependency_versions_table.py    |   1 +
 tests/pipelines/altdiffusion/__init__.py      |   0
 .../altdiffusion/test_alt_diffusion.py        | 347 ++++++++++++++++++
 .../test_alt_diffusion_img2img.py             | 256 +++++++++++++
 5 files changed, 606 insertions(+)
 create mode 100644 tests/pipelines/altdiffusion/__init__.py
 create mode 100644 tests/pipelines/altdiffusion/test_alt_diffusion.py
 create mode 100644 tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py

diff --git a/setup.py b/setup.py
index 1bb6af4b10bf..27836701b6d0 100644
--- a/setup.py
+++ b/setup.py
@@ -97,6 +97,7 @@
     "pytest",
     "pytest-timeout",
     "pytest-xdist",
+    "sentencepiece>=0.1.91,!=0.1.92",
     "scipy",
     "regex!=2019.12.17",
     "requests",
@@ -183,6 +184,7 @@ def run(self):
     "pytest",
     "pytest-timeout",
     "pytest-xdist",
+    "sentencepiece",
     "scipy",
     "torchvision",
     "transformers"
diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py
index 59e13da0f22b..202c890760bb 100644
--- a/src/diffusers/dependency_versions_table.py
+++ b/src/diffusers/dependency_versions_table.py
@@ -21,6 +21,7 @@
     "pytest": "pytest",
     "pytest-timeout": "pytest-timeout",
     "pytest-xdist": "pytest-xdist",
+    "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
     "scipy": "scipy",
     "regex": "regex!=2019.12.17",
     "requests": "requests",
diff --git a/tests/pipelines/altdiffusion/__init__.py b/tests/pipelines/altdiffusion/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py
new file mode 100644
index 000000000000..b743d100ce73
--- /dev/null
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py
@@ -0,0 +1,347 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import AltDiffusionPipeline, AutoencoderKL, DDIMScheduler, PNDMScheduler, UNet2DConditionModel
+from diffusers.pipelines.alt_diffusion.modeling_roberta_series import (
+    RobertaSeriesConfig,
+    RobertaSeriesModelWithTransformation,
+)
+from diffusers.utils import floats_tensor, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
+from transformers import XLMRobertaTokenizer
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_cond_unet_inpaint(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = RobertaSeriesConfig(
+            hidden_size=32,
+            project_dim=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            vocab_size=5002,
+        )
+        return RobertaSeriesModelWithTransformation(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_alt_diffusion_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
+        tokenizer.model_max_length = 77
+
+        # make sure here that pndm scheduler skips prk
+        alt_pipe = AltDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        alt_pipe = alt_pipe.to(device)
+        alt_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A photo of an astronaut"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = alt_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = alt_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array(
+            [0.49249017, 0.46064827, 0.4790093, 0.50883967, 0.4811985, 0.51540506, 0.5084924, 0.4860553, 0.47318557]
+        )
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_alt_diffusion_pndm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
+        tokenizer.model_max_length = 77
+
+        # make sure here that pndm scheduler skips prk
+        alt_pipe = AltDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        alt_pipe = alt_pipe.to(device)
+        alt_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = alt_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = alt_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array(
+            [0.4786532, 0.45791715, 0.47507674, 0.50763345, 0.48375353, 0.515062, 0.51244247, 0.48673993, 0.47105807]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_alt_diffusion_fp16(self):
+        """Test that stable diffusion works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
+        tokenizer.model_max_length = 77
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        alt_pipe = AltDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        alt_pipe = alt_pipe.to(torch_device)
+        alt_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
+
+        assert image.shape == (1, 128, 128, 3)
+
+
+@slow
+@require_torch_gpu
+class AltDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_alt_diffusion(self):
+        # make sure here that pndm scheduler skips prk
+        alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", safety_checker=None)
+        alt_pipe = alt_pipe.to(torch_device)
+        alt_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast("cuda"):
+            output = alt_pipe(
+                [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np"
+            )
+
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.8720703, 0.87109375, 0.87402344, 0.87109375, 0.8779297, 0.8925781, 0.8823242, 0.8808594, 0.8613281]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_alt_diffusion_fast_ddim(self):
+        scheduler = DDIMScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler")
+
+        alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None)
+        alt_pipe = alt_pipe.to(torch_device)
+        alt_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+
+        with torch.autocast("cuda"):
+            output = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy")
+        image = output.images
+
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array(
+            [0.9267578, 0.9301758, 0.9013672, 0.9345703, 0.92578125, 0.94433594, 0.9423828, 0.9423828, 0.9160156]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_alt_diffusion_text2img_pipeline_fp16(self):
+        torch.cuda.reset_peak_memory_stats()
+        model_id = "BAAI/AltDiffusion"
+        pipe = AltDiffusionPipeline.from_pretrained(
+            model_id, revision="fp16", torch_dtype=torch.float16, safety_checker=None
+        )
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photograph of an astronaut riding a horse"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output_chunked = pipe(
+            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+        )
+        image_chunked = output_chunked.images
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            output = pipe(
+                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+            )
+            image = output.images
+
+        # Make sure results are close enough
+        diff = np.abs(image_chunked.flatten() - image.flatten())
+        # They ARE different since ops are not run always at the same precision
+        # however, they should be extremely close.
+        assert diff.mean() < 2e-2
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
new file mode 100644
index 000000000000..0dab14b31716
--- /dev/null
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
@@ -0,0 +1,256 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import AltDiffusionImg2ImgPipeline, AutoencoderKL, PNDMScheduler, UNet2DConditionModel
+from diffusers.pipelines.alt_diffusion.modeling_roberta_series import (
+    RobertaSeriesConfig,
+    RobertaSeriesModelWithTransformation,
+)
+from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
+from transformers import XLMRobertaTokenizer
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = RobertaSeriesConfig(
+            hidden_size=32,
+            project_dim=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=5006,
+        )
+        return RobertaSeriesModelWithTransformation(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_stable_diffusion_img2img_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
+        tokenizer.model_max_length = 77
+
+        init_image = self.dummy_image.to(device)
+
+        # make sure here that pndm scheduler skips prk
+        alt_pipe = AltDiffusionImg2ImgPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        alt_pipe = alt_pipe.to(device)
+        alt_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = alt_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+        )
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = alt_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        expected_slice = np.array(
+            [0.41293705, 0.38656747, 0.40876025, 0.4782187, 0.4656803, 0.41394007, 0.4142093, 0.47150758, 0.4570448]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1.5e-3
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1.5e-3
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_img2img_fp16(self):
+        """Test that stable diffusion img2img works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
+        tokenizer.model_max_length = 77
+
+        init_image = self.dummy_image.to(torch_device)
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        alt_pipe = AltDiffusionImg2ImgPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        alt_pipe = alt_pipe.to(torch_device)
+        alt_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = alt_pipe(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+            init_image=init_image,
+        ).images
+
+        assert image.shape == (1, 32, 32, 3)
+
+
+@slow
+@require_torch_gpu
+class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_img2img_pipeline_default(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((768, 512))
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy"
+        )
+
+        model_id = "BAAI/AltDiffusion"
+        pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
+            model_id,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "A fantasy landscape, trending on artstation"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 768, 3)
+        # img2img is flaky across GPUs even in fp32, so using MAE here
+        assert np.abs(expected_image - image).max() < 1e-3

From 65d136e067350dcea8cf9f72da1df79599e4bbb8 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 16 Nov 2022 15:58:22 +0100
Subject: [PATCH 196/269] Add improved handling of pil (#1309)

* Better error message for transformers dummy

* [PIL] Better deprecation functionality

* up
---
 .../source/api/pipelines/latent_diffusion.mdx |  4 ++--
 docs/source/api/pipelines/overview.mdx        |  1 +
 docs/source/index.mdx                         |  1 +
 examples/community/imagic_stable_diffusion.py |  4 ++--
 examples/community/lpw_stable_diffusion.py    |  6 +++---
 .../community/lpw_stable_diffusion_onnx.py    |  6 +++---
 .../textual_inversion/textual_inversion.py    | 10 ++++-----
 .../textual_inversion_flax.py                 | 10 ++++-----
 setup.py                                      |  2 +-
 src/diffusers/dependency_versions_table.py    |  2 +-
 .../pipeline_alt_diffusion_img2img.py         |  5 +++--
 ...peline_latent_diffusion_superresolution.py |  3 ++-
 .../pipeline_cycle_diffusion.py               |  4 ++--
 .../pipeline_onnx_stable_diffusion_img2img.py |  4 ++--
 .../pipeline_onnx_stable_diffusion_inpaint.py |  4 ++--
 .../pipeline_stable_diffusion_img2img.py      |  4 ++--
 ...ipeline_stable_diffusion_inpaint_legacy.py |  6 +++---
 src/diffusers/utils/__init__.py               |  1 +
 src/diffusers/utils/pil_utils.py              | 21 +++++++++++++++++++
 .../test_latent_diffusion_superresolution.py  |  5 ++---
 20 files changed, 64 insertions(+), 39 deletions(-)
 create mode 100644 src/diffusers/utils/pil_utils.py

diff --git a/docs/source/api/pipelines/latent_diffusion.mdx b/docs/source/api/pipelines/latent_diffusion.mdx
index 4ade13e67b15..370d014f5a10 100644
--- a/docs/source/api/pipelines/latent_diffusion.mdx
+++ b/docs/source/api/pipelines/latent_diffusion.mdx
@@ -39,9 +39,9 @@ The original codebase can be found [here](https://github.com/CompVis/latent-diff
 
 
 ## LDMTextToImagePipeline
-[[autodoc]] pipelines.latent_diffusion.pipeline_latent_diffusion.LDMTextToImagePipeline
+[[autodoc]] LDMTextToImagePipeline
     - __call__
 
 ## LDMSuperResolutionPipeline
-[[autodoc]] pipelines.latent_diffusion.pipeline_latent_diffusion_superresolution.LDMSuperResolutionPipeline
+[[autodoc]] LDMSuperResolutionPipeline
     - __call__
diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index d504ecbe47a8..74c44fbccdf5 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -50,6 +50,7 @@ available a colab notebook to directly try them out.
 | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
 | [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
 | [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
+| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | 
 | [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
 | [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
 | [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 6e145490642c..e4722bec6897 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -40,6 +40,7 @@ available a colab notebook to directly try them out.
 | [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation |
 | [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation |
 | [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | 
+| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | 
 | [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | 
 | [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | 
 | [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | 
diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py
index 0c95fb435848..d6d89283b194 100644
--- a/examples/community/imagic_stable_diffusion.py
+++ b/examples/community/imagic_stable_diffusion.py
@@ -17,7 +17,7 @@
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import logging
+from diffusers.utils import PIL_INTERPOLATION, logging
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
@@ -28,7 +28,7 @@
 def preprocess(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
     image = torch.from_numpy(image)
diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index e4ee7bf3c6fa..8c5f5b46a768 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -12,7 +12,7 @@
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import deprecate, is_accelerate_available, logging
+from diffusers.utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, logging
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 
@@ -358,7 +358,7 @@ def get_weighted_text_embeddings(
 def preprocess_image(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
     image = torch.from_numpy(image)
@@ -369,7 +369,7 @@ def preprocess_mask(mask):
     mask = mask.convert("L")
     w, h = mask.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = mask.resize((w // 8, h // 8), resample=PIL_INTERPOLATION["nearest"])
     mask = np.array(mask).astype(np.float32) / 255.0
     mask = np.tile(mask, (4, 1, 1))
     mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py
index 12e306a612e1..268af775a3cd 100644
--- a/examples/community/lpw_stable_diffusion_onnx.py
+++ b/examples/community/lpw_stable_diffusion_onnx.py
@@ -10,7 +10,7 @@
 from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import logging
+from diffusers.utils import PIL_INTERPOLATION, logging
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 
@@ -365,7 +365,7 @@ def get_weighted_text_embeddings(
 def preprocess_image(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
     return 2.0 * image - 1.0
@@ -375,7 +375,7 @@ def preprocess_mask(mask):
     mask = mask.convert("L")
     w, h = mask.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = mask.resize((w // 8, h // 8), resample=PIL_INTERPOLATION["nearest"])
     mask = np.array(mask).astype(np.float32) / 255.0
     mask = np.tile(mask, (4, 1, 1))
     mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index fc9380edcd73..532ce4a74139 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -12,13 +12,13 @@
 import torch.utils.checkpoint
 from torch.utils.data import Dataset
 
-import PIL
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
 from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+from diffusers.utils import PIL_INTERPOLATION
 from huggingface_hub import HfFolder, Repository, whoami
 from PIL import Image
 from torchvision import transforms
@@ -260,10 +260,10 @@ def __init__(
             self._length = self.num_images * repeats
 
         self.interpolation = {
-            "linear": PIL.Image.LINEAR,
-            "bilinear": PIL.Image.BILINEAR,
-            "bicubic": PIL.Image.BICUBIC,
-            "lanczos": PIL.Image.LANCZOS,
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
         }[interpolation]
 
         self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py
index be2b7ffb5490..008fe812c92f 100644
--- a/examples/textual_inversion/textual_inversion_flax.py
+++ b/examples/textual_inversion/textual_inversion_flax.py
@@ -14,7 +14,6 @@
 import jax
 import jax.numpy as jnp
 import optax
-import PIL
 import transformers
 from diffusers import (
     FlaxAutoencoderKL,
@@ -24,6 +23,7 @@
     FlaxUNet2DConditionModel,
 )
 from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
+from diffusers.utils import PIL_INTERPOLATION
 from flax import jax_utils
 from flax.training import train_state
 from flax.training.common_utils import shard
@@ -246,10 +246,10 @@ def __init__(
             self._length = self.num_images * repeats
 
         self.interpolation = {
-            "linear": PIL.Image.LINEAR,
-            "bilinear": PIL.Image.BILINEAR,
-            "bicubic": PIL.Image.BICUBIC,
-            "lanczos": PIL.Image.LANCZOS,
+            "linear": PIL_INTERPOLATION["linear"],
+            "bilinear": PIL_INTERPOLATION["bilinear"],
+            "bicubic": PIL_INTERPOLATION["bicubic"],
+            "lanczos": PIL_INTERPOLATION["lanczos"],
         }[interpolation]
 
         self.templates = imagenet_style_templates_small if learnable_property == "style" else imagenet_templates_small
diff --git a/setup.py b/setup.py
index 27836701b6d0..d0aff10da63e 100644
--- a/setup.py
+++ b/setup.py
@@ -78,7 +78,7 @@
 # 1. all dependencies should be listed here with their version requirements if any
 # 2. once modified, run: `make deps_table_update` to update src/diffusers/dependency_versions_table.py
 _deps = [
-    "Pillow<10.0",  # keep the PIL.Image.Resampling deprecation away
+    "Pillow",  # keep the PIL.Image.Resampling deprecation away
     "accelerate>=0.11.0",
     "black==22.8",
     "datasets",
diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py
index 202c890760bb..d187b7914534 100644
--- a/src/diffusers/dependency_versions_table.py
+++ b/src/diffusers/dependency_versions_table.py
@@ -2,7 +2,7 @@
 # 1. modify the `_deps` dict in setup.py
 # 2. run `make deps_table_update``
 deps = {
-    "Pillow": "Pillow<10.0",
+    "Pillow": "Pillow",
     "accelerate": "accelerate>=0.11.0",
     "black": "black==22.8",
     "datasets": "datasets",
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 5b675c121b48..294a43e86e61 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -33,7 +33,7 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from ...utils import deprecate, logging
+from ...utils import PIL_INTERPOLATION, deprecate, logging
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation
 
@@ -41,10 +41,11 @@
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
+# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess
 def preprocess(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
     image = torch.from_numpy(image)
diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
index 044ff359e317..b296a4953f97 100644
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
@@ -17,12 +17,13 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
+from ...utils import PIL_INTERPOLATION
 
 
 def preprocess(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
     image = torch.from_numpy(image)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index ec14914eb4c2..b5f409929234 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -26,7 +26,7 @@
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler
-from ...utils import deprecate, logging
+from ...utils import PIL_INTERPOLATION, deprecate, logging
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
 
@@ -37,7 +37,7 @@
 def preprocess(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
     image = torch.from_numpy(image)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index 483b5fd2d344..8b4f78c49787 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -25,7 +25,7 @@
 from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from ...utils import deprecate, logging
+from ...utils import PIL_INTERPOLATION, deprecate, logging
 from . import StableDiffusionPipelineOutput
 
 
@@ -35,7 +35,7 @@
 def preprocess(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
     return 2.0 * image - 1.0
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 8e5c20131935..6228824b3d24 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -25,7 +25,7 @@
 from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from ...utils import deprecate, logging
+from ...utils import PIL_INTERPOLATION, deprecate, logging
 from . import StableDiffusionPipelineOutput
 
 
@@ -44,7 +44,7 @@ def prepare_mask_and_masked_image(image, mask, latents_shape):
     image_mask = np.array(mask.convert("L").resize((latents_shape[1] * 8, latents_shape[0] * 8)))
     masked_image = image * (image_mask < 127.5)
 
-    mask = mask.resize((latents_shape[1], latents_shape[0]), PIL.Image.NEAREST)
+    mask = mask.resize((latents_shape[1], latents_shape[0]), PIL_INTERPOLATION["nearest"])
     mask = np.array(mask.convert("L"))
     mask = mask.astype(np.float32) / 255.0
     mask = mask[None, None]
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index d8d3e7098663..4bfbc5fbcb6e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -33,7 +33,7 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from ...utils import deprecate, logging
+from ...utils import PIL_INTERPOLATION, deprecate, logging
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
 
@@ -44,7 +44,7 @@
 def preprocess(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
     image = torch.from_numpy(image)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 72def76f2887..5c2a3e952375 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -33,7 +33,7 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from ...utils import deprecate, logging
+from ...utils import PIL_INTERPOLATION, deprecate, logging
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
 
@@ -44,7 +44,7 @@
 def preprocess_image(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
     image = np.array(image).astype(np.float32) / 255.0
     image = image[None].transpose(0, 3, 1, 2)
     image = torch.from_numpy(image)
@@ -55,7 +55,7 @@ def preprocess_mask(mask):
     mask = mask.convert("L")
     w, h = mask.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = mask.resize((w // 8, h // 8), resample=PIL_INTERPOLATION["nearest"])
     mask = np.array(mask).astype(np.float32) / 255.0
     mask = np.tile(mask, (4, 1, 1))
     mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index a80d24949899..909d878ed614 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -38,6 +38,7 @@
 )
 from .logging import get_logger
 from .outputs import BaseOutput
+from .pil_utils import PIL_INTERPOLATION
 
 
 if is_torch_available():
diff --git a/src/diffusers/utils/pil_utils.py b/src/diffusers/utils/pil_utils.py
new file mode 100644
index 000000000000..39d0a15a4e2f
--- /dev/null
+++ b/src/diffusers/utils/pil_utils.py
@@ -0,0 +1,21 @@
+import PIL.Image
+import PIL.ImageOps
+from packaging import version
+
+
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
index f402d2f2a70e..c04210dede31 100644
--- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
@@ -19,9 +19,8 @@
 import numpy as np
 import torch
 
-import PIL
 from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel
-from diffusers.utils import floats_tensor, load_image, slow, torch_device
+from diffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow, torch_device
 from diffusers.utils.testing_utils import require_torch
 
 from ...test_pipelines_common import PipelineTesterMixin
@@ -97,7 +96,7 @@ def test_inference_superresolution(self):
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
             "/vq_diffusion/teddy_bear_pool.png"
         )
-        init_image = init_image.resize((64, 64), resample=PIL.Image.LANCZOS)
+        init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"])
 
         ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution", device_map="auto")
         ldm.to(torch_device)

From 09d0546ad0489b84cd8930c2267d9097fa4f19e2 Mon Sep 17 00:00:00 2001
From: dblunk88 <39381389+dblunk88@users.noreply.github.com>
Date: Wed, 16 Nov 2022 11:40:16 -0500
Subject: [PATCH 197/269] cpu offloading: mutli GPU support (#1143)

mutli GPU support
---
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 65922451f0d4..963d75c58bdd 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -178,7 +178,7 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
-    def enable_sequential_cpu_offload(self):
+    def enable_sequential_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -189,7 +189,7 @@ def enable_sequential_cpu_offload(self):
         else:
             raise ImportError("Please install accelerate via `pip install accelerate`")
 
-        device = torch.device("cuda")
+        device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
             if cpu_offloaded_model is not None:

From f1fcfdeec5ae98b30b0939baf9e64f47c813da99 Mon Sep 17 00:00:00 2001
From: Will Berman <wlbberman@gmail.com>
Date: Wed, 16 Nov 2022 08:51:43 -0800
Subject: [PATCH 198/269] vq diffusion classifier free sampling (#1294)

* vq diffusion classifier free sampling

* correct

* uP

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 scripts/convert_vq_diffusion_to_diffusers.py  |  44 +++++-
 .../pipelines/vq_diffusion/__init__.py        |   6 +-
 .../vq_diffusion/pipeline_vq_diffusion.py     | 142 ++++++++++++++----
 .../vq_diffusion/test_vq_diffusion.py         |  69 ++++++++-
 4 files changed, 220 insertions(+), 41 deletions(-)

diff --git a/scripts/convert_vq_diffusion_to_diffusers.py b/scripts/convert_vq_diffusion_to_diffusers.py
index ae105e30362e..85db67844acd 100644
--- a/scripts/convert_vq_diffusion_to_diffusers.py
+++ b/scripts/convert_vq_diffusion_to_diffusers.py
@@ -39,8 +39,8 @@
 
 import yaml
 from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-from diffusers import VQDiffusionPipeline, VQDiffusionScheduler, VQModel
-from diffusers.models.attention import Transformer2DModel
+from diffusers import Transformer2DModel, VQDiffusionPipeline, VQDiffusionScheduler, VQModel
+from diffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings
 from transformers import CLIPTextModel, CLIPTokenizer
 from yaml.loader import FullLoader
 
@@ -826,6 +826,20 @@ def read_config_file(filename):
         transformer_model, checkpoint
     )
 
+    # classifier free sampling embeddings interlude
+
+    # The learned embeddings are stored on the transformer in the original VQ-diffusion. We store them on a separate
+    # model, so we pull them off the checkpoint before the checkpoint is deleted.
+
+    learnable_classifier_free_sampling_embeddings = diffusion_config.params.learnable_cf
+
+    if learnable_classifier_free_sampling_embeddings:
+        learned_classifier_free_sampling_embeddings_embeddings = checkpoint["transformer.empty_text_embed"]
+    else:
+        learned_classifier_free_sampling_embeddings_embeddings = None
+
+    # done classifier free sampling embeddings interlude
+
     with tempfile.NamedTemporaryFile() as diffusers_transformer_checkpoint_file:
         torch.save(diffusers_transformer_checkpoint, diffusers_transformer_checkpoint_file.name)
         del diffusers_transformer_checkpoint
@@ -871,6 +885,31 @@ def read_config_file(filename):
 
     # done scheduler
 
+    # learned classifier free sampling embeddings
+
+    with init_empty_weights():
+        learned_classifier_free_sampling_embeddings_model = LearnedClassifierFreeSamplingEmbeddings(
+            learnable_classifier_free_sampling_embeddings,
+            hidden_size=text_encoder_model.config.hidden_size,
+            length=tokenizer_model.model_max_length,
+        )
+
+    learned_classifier_free_sampling_checkpoint = {
+        "embeddings": learned_classifier_free_sampling_embeddings_embeddings.float()
+    }
+
+    with tempfile.NamedTemporaryFile() as learned_classifier_free_sampling_checkpoint_file:
+        torch.save(learned_classifier_free_sampling_checkpoint, learned_classifier_free_sampling_checkpoint_file.name)
+        del learned_classifier_free_sampling_checkpoint
+        del learned_classifier_free_sampling_embeddings_embeddings
+        load_checkpoint_and_dispatch(
+            learned_classifier_free_sampling_embeddings_model,
+            learned_classifier_free_sampling_checkpoint_file.name,
+            device_map="auto",
+        )
+
+    # done learned classifier free sampling embeddings
+
     print(f"saving VQ diffusion model, path: {args.dump_path}")
 
     pipe = VQDiffusionPipeline(
@@ -878,6 +917,7 @@ def read_config_file(filename):
         transformer=transformer_model,
         tokenizer=tokenizer_model,
         text_encoder=text_encoder_model,
+        learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings_model,
         scheduler=scheduler_model,
     )
     pipe.save_pretrained(args.dump_path)
diff --git a/src/diffusers/pipelines/vq_diffusion/__init__.py b/src/diffusers/pipelines/vq_diffusion/__init__.py
index edf6f570f5bf..8c9f14f00064 100644
--- a/src/diffusers/pipelines/vq_diffusion/__init__.py
+++ b/src/diffusers/pipelines/vq_diffusion/__init__.py
@@ -1 +1,5 @@
-from .pipeline_vq_diffusion import VQDiffusionPipeline
+from ...utils import is_torch_available, is_transformers_available
+
+
+if is_transformers_available() and is_torch_available():
+    from .pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings, VQDiffusionPipeline
diff --git a/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py b/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
index 6e5325ba7ef5..333599d7ecf8 100644
--- a/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
+++ b/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
@@ -20,6 +20,8 @@
 from diffusers.schedulers.scheduling_vq_diffusion import VQDiffusionScheduler
 from transformers import CLIPTextModel, CLIPTokenizer
 
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...modeling_utils import ModelMixin
 from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...utils import logging
 
@@ -27,6 +29,28 @@
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
+class LearnedClassifierFreeSamplingEmbeddings(ModelMixin, ConfigMixin):
+    """
+    Utility class for storing learned text embeddings for classifier free sampling
+    """
+
+    @register_to_config
+    def __init__(self, learnable: bool, hidden_size: Optional[int] = None, length: Optional[int] = None):
+        super().__init__()
+
+        self.learnable = learnable
+
+        if self.learnable:
+            assert hidden_size is not None, "learnable=True requires `hidden_size` to be set"
+            assert length is not None, "learnable=True requires `length` to be set"
+
+            embeddings = torch.zeros(length, hidden_size)
+        else:
+            embeddings = None
+
+        self.embeddings = torch.nn.Parameter(embeddings)
+
+
 class VQDiffusionPipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-image generation using VQ Diffusion
@@ -55,6 +79,7 @@ class VQDiffusionPipeline(DiffusionPipeline):
     text_encoder: CLIPTextModel
     tokenizer: CLIPTokenizer
     transformer: Transformer2DModel
+    learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings
     scheduler: VQDiffusionScheduler
 
     def __init__(
@@ -64,6 +89,7 @@ def __init__(
         tokenizer: CLIPTokenizer,
         transformer: Transformer2DModel,
         scheduler: VQDiffusionScheduler,
+        learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings,
     ):
         super().__init__()
 
@@ -73,13 +99,78 @@ def __init__(
             text_encoder=text_encoder,
             tokenizer=tokenizer,
             scheduler=scheduler,
+            learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings,
+        )
+
+    def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance):
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            return_tensors="pt",
         )
+        text_input_ids = text_inputs.input_ids
+
+        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+
+        # NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion.
+        # While CLIP does normalize the pooled output of the text transformer when combining
+        # the image and text embeddings, CLIP does not directly normalize the last hidden state.
+        #
+        # CLIP normalizing the pooled output.
+        # https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053
+        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
+
+        # duplicate text embeddings for each generation per prompt
+        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
+
+        if do_classifier_free_guidance:
+            if self.learned_classifier_free_sampling_embeddings.learnable:
+                uncond_embeddings = self.learned_classifier_free_sampling_embeddings.embeddings
+                uncond_embeddings = uncond_embeddings.unsqueeze(0).repeat(batch_size, 1, 1)
+            else:
+                uncond_tokens = [""] * batch_size
+
+                max_length = text_input_ids.shape[-1]
+                uncond_input = self.tokenizer(
+                    uncond_tokens,
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+                # See comment for normalizing text embeddings
+                uncond_embeddings = uncond_embeddings / uncond_embeddings.norm(dim=-1, keepdim=True)
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
 
     @torch.no_grad()
     def __call__(
         self,
         prompt: Union[str, List[str]],
         num_inference_steps: int = 100,
+        guidance_scale: float = 5.0,
         truncation_rate: float = 1.0,
         num_images_per_prompt: int = 1,
         generator: Optional[torch.Generator] = None,
@@ -98,6 +189,12 @@ def __call__(
             num_inference_steps (`int`, *optional*, defaults to 100):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
             truncation_rate (`float`, *optional*, defaults to 1.0 (equivalent to no truncation)):
                 Used to "truncate" the predicted classes for x_0 such that the cumulative probability for a pixel is at
                 most `truncation_rate`. The lowest probabilities that would increase the cumulative probability above
@@ -137,6 +234,10 @@ def __call__(
 
         batch_size = batch_size * num_images_per_prompt
 
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        text_embeddings = self._encode_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance)
+
         if (callback_steps is None) or (
             callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
         ):
@@ -145,35 +246,6 @@ def __call__(
                 f" {type(callback_steps)}."
             )
 
-        # get prompt text embeddings
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=self.tokenizer.model_max_length,
-            return_tensors="pt",
-        )
-        text_input_ids = text_inputs.input_ids
-
-        if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
-            removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
-            logger.warning(
-                "The following part of your input was truncated because CLIP can only handle sequences up to"
-                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-            )
-            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
-        text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
-
-        # NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion.
-        # While CLIP does normalize the pooled output of the text transformer when combining
-        # the image and text embeddings, CLIP does not directly normalize the last hidden state.
-        #
-        # CLIP normalizing the pooled output.
-        # https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053
-        text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
-
-        # duplicate text embeddings for each generation per prompt
-        text_embeddings = text_embeddings.repeat_interleave(num_images_per_prompt, dim=0)
-
         # get the initial completely masked latents unless the user supplied it
 
         latents_shape = (batch_size, self.transformer.num_latent_pixels)
@@ -198,9 +270,19 @@ def __call__(
         sample = latents
 
         for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the sample if we are doing classifier free guidance
+            latent_model_input = torch.cat([sample] * 2) if do_classifier_free_guidance else sample
+
             # predict the un-noised image
             # model_output == `log_p_x_0`
-            model_output = self.transformer(sample, encoder_hidden_states=text_embeddings, timestep=t).sample
+            model_output = self.transformer(
+                latent_model_input, encoder_hidden_states=text_embeddings, timestep=t
+            ).sample
+
+            if do_classifier_free_guidance:
+                model_output_uncond, model_output_text = model_output.chunk(2)
+                model_output = model_output_uncond + guidance_scale * (model_output_text - model_output_uncond)
+                model_output -= torch.logsumexp(model_output, dim=1, keepdim=True)
 
             model_output = self.truncate(model_output, truncation_rate)
 
diff --git a/tests/pipelines/vq_diffusion/test_vq_diffusion.py b/tests/pipelines/vq_diffusion/test_vq_diffusion.py
index 5eb32d40d4f3..87e29cbc97de 100644
--- a/tests/pipelines/vq_diffusion/test_vq_diffusion.py
+++ b/tests/pipelines/vq_diffusion/test_vq_diffusion.py
@@ -20,7 +20,8 @@
 import torch
 
 from diffusers import Transformer2DModel, VQDiffusionPipeline, VQDiffusionScheduler, VQModel
-from diffusers.utils import load_image, slow, torch_device
+from diffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings
+from diffusers.utils import load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
@@ -45,6 +46,10 @@ def num_embed(self):
     def num_embeds_ada_norm(self):
         return 12
 
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+
     @property
     def dummy_vqvae(self):
         torch.manual_seed(0)
@@ -71,7 +76,7 @@ def dummy_text_encoder(self):
         config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
-            hidden_size=32,
+            hidden_size=self.text_embedder_hidden_size,
             intermediate_size=37,
             layer_norm_eps=1e-05,
             num_attention_heads=4,
@@ -111,9 +116,15 @@ def test_vq_diffusion(self):
         tokenizer = self.dummy_tokenizer
         transformer = self.dummy_transformer
         scheduler = VQDiffusionScheduler(self.num_embed)
+        learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings(learnable=False)
 
         pipe = VQDiffusionPipeline(
-            vqvae=vqvae, text_encoder=text_encoder, tokenizer=tokenizer, transformer=transformer, scheduler=scheduler
+            vqvae=vqvae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler,
+            learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings,
         )
         pipe = pipe.to(device)
         pipe.set_progress_bar_config(disable=None)
@@ -139,6 +150,50 @@ def test_vq_diffusion(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
+    def test_vq_diffusion_classifier_free_sampling(self):
+        device = "cpu"
+
+        vqvae = self.dummy_vqvae
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        transformer = self.dummy_transformer
+        scheduler = VQDiffusionScheduler(self.num_embed)
+        learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings(
+            learnable=True, hidden_size=self.text_embedder_hidden_size, length=tokenizer.model_max_length
+        )
+
+        pipe = VQDiffusionPipeline(
+            vqvae=vqvae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler,
+            learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings,
+        )
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "teddy bear playing in the pool"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = pipe(
+            [prompt], generator=generator, output_type="np", return_dict=False, num_inference_steps=2
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 24, 24, 3)
+
+        expected_slice = np.array([0.6647, 0.6531, 0.5303, 0.5891, 0.5726, 0.4439, 0.6304, 0.5564, 0.4912])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
 
 @slow
 @require_torch_gpu
@@ -149,12 +204,11 @@ def tearDown(self):
         gc.collect()
         torch.cuda.empty_cache()
 
-    def test_vq_diffusion(self):
-        expected_image = load_image(
+    def test_vq_diffusion_classifier_free_sampling(self):
+        expected_image = load_numpy(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-            "/vq_diffusion/teddy_bear_pool.png"
+            "/vq_diffusion/teddy_bear_pool_classifier_free_sampling.npy"
         )
-        expected_image = np.array(expected_image, dtype=np.float32) / 255.0
 
         pipeline = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq")
         pipeline = pipeline.to(torch_device)
@@ -163,7 +217,6 @@ def test_vq_diffusion(self):
         generator = torch.Generator(device=torch_device).manual_seed(0)
         output = pipeline(
             "teddy bear playing in the pool",
-            truncation_rate=0.86,
             num_images_per_prompt=1,
             generator=generator,
             output_type="np",

From aa5c4c26092773e87cdd1c7563025f6764940845 Mon Sep 17 00:00:00 2001
From: Kamal Raj <kamalraj97@gmail.com>
Date: Wed, 16 Nov 2022 22:33:44 +0530
Subject: [PATCH 199/269] doc string args shape fix (#1243)

* doc string args shape fix

* fix styling
---
 src/diffusers/models/unet_2d_condition.py      | 3 ++-
 src/diffusers/models/unet_2d_condition_flax.py | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index becae7568343..c3f2fb87b60d 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -251,7 +251,8 @@ def forward(
         Args:
             sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
             timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
-            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
+            encoder_hidden_states (`torch.FloatTensor`):
+                (batch_size, sequence_length, hidden_size) encoder hidden states
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
 
diff --git a/src/diffusers/models/unet_2d_condition_flax.py b/src/diffusers/models/unet_2d_condition_flax.py
index f0e721826bd7..7ca9c191b448 100644
--- a/src/diffusers/models/unet_2d_condition_flax.py
+++ b/src/diffusers/models/unet_2d_condition_flax.py
@@ -230,9 +230,9 @@ def __call__(
     ) -> Union[FlaxUNet2DConditionOutput, Tuple]:
         r"""
         Args:
-            sample (`jnp.ndarray`): (channel, height, width) noisy inputs tensor
+            sample (`jnp.ndarray`): (batch, channel, height, width) noisy inputs tensor
             timestep (`jnp.ndarray` or `float` or `int`): timesteps
-            encoder_hidden_states (`jnp.ndarray`): (channel, height, width) encoder hidden states
+            encoder_hidden_states (`jnp.ndarray`): (batch_size, sequence_length, hidden_size) encoder hidden states
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`models.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a
                 plain tuple.

From afdd7bb635c3b88398acf8f81cd68d01a6381951 Mon Sep 17 00:00:00 2001
From: Dhruv Karan <k4r4n.dhruv@gmail.com>
Date: Wed, 16 Nov 2022 22:48:51 +0530
Subject: [PATCH 200/269] [Community Pipeline] CLIPSeg +
 StableDiffusionInpainting (#1250)

* text inpainting

* refactor
---
 examples/community/README.md          |  35 +++
 examples/community/text_inpainting.py | 320 ++++++++++++++++++++++++++
 2 files changed, 355 insertions(+)
 create mode 100644 examples/community/text_inpainting.py

diff --git a/examples/community/README.md b/examples/community/README.md
index 5535937dca7d..dc35d36a953d 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -20,6 +20,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image| [Imagic Stable Diffusion](#imagic-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Mark Rich](https://github.com/MarkRich) |
 | Multilingual Stable Diffusion| Stable Diffusion Pipeline that supports prompts in 50 different languages.                                                                                                                                                                                                                                                                                                     | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline)                                                                 | -                                                                                                                                                                                                                  |                        [Juan Carlos Piñeros](https://github.com/juancopi81) |
 | Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting| [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Alex McKinney](https://github.com/vvvm23) |
+| Text Based Inpainting Stable Diffusion | Stable Diffusion Inpainting Pipeline that enables passing a text prompt to generate the mask for inpainting| [Text Based Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Dhruv Karan](https://github.com/unography) |
 
 
 
@@ -618,3 +619,37 @@ pipe = pipe.to("cuda")
 prompt = "Your prompt here!"
 image = pipe(prompt=prompt, image=init_image, inner_image=inner_image, mask_image=mask_image).images[0]
 ```
+
+### Text Based Inpainting Stable Diffusion
+
+Use a text prompt to generate the mask for the area to be inpainted.
+Currently uses the CLIPSeg model for mask generation, then calls the standard Stable Diffusion Inpainting pipeline to perform the inpainting.
+
+```python
+from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation
+from diffusers import DiffusionPipeline
+
+from PIL import Image
+import requests
+from torch import autocast
+
+processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
+model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")
+
+pipe = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-inpainting",
+    custom_pipeline="text_inpainting",
+    segmentation_model=model,
+    segmentation_processor=processor
+)
+pipe = pipe.to("cuda")
+
+
+url = "https://github.com/timojl/clipseg/blob/master/example_image.jpg?raw=true"
+image = Image.open(requests.get(url, stream=True).raw).resize((512, 512))
+text = "a glass"  # will mask out this text
+prompt = "a cup"  # the masked out region will be replaced with this
+
+with autocast("cuda"):
+    image = pipe(image=image, text=text, prompt=prompt).images[0]
+```
\ No newline at end of file
diff --git a/examples/community/text_inpainting.py b/examples/community/text_inpainting.py
new file mode 100644
index 000000000000..38d5e963372d
--- /dev/null
+++ b/examples/community/text_inpainting.py
@@ -0,0 +1,320 @@
+from typing import Callable, List, Optional, Union
+
+import torch
+
+import PIL
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, is_accelerate_available, logging
+from transformers import (
+    CLIPFeatureExtractor,
+    CLIPSegForImageSegmentation,
+    CLIPSegProcessor,
+    CLIPTextModel,
+    CLIPTokenizer,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class TextInpainting(DiffusionPipeline):
+    r"""
+    Pipeline for text based inpainting using Stable Diffusion.
+    Uses CLIPSeg to get a mask from the given text, then calls the Inpainting pipeline with the generated mask
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        segmentation_model ([`CLIPSegForImageSegmentation`]):
+            CLIPSeg Model to generate mask from the given text. Please refer to the [model card]() for details.
+        segmentation_processor ([`CLIPSegProcessor`]):
+            CLIPSeg processor to get image, text features to translate prompt to English, if necessary. Please refer to the
+            [model card](https://huggingface.co/docs/transformers/model_doc/clipseg) for details.
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        segmentation_model: CLIPSegForImageSegmentation,
+        segmentation_processor: CLIPSegProcessor,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration"
+                " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make"
+                " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to"
+                " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face"
+                " Hub, it would be very nice if you could open a Pull request for the"
+                " `scheduler/scheduler_config.json` file"
+            )
+            deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["skip_prk_steps"] = True
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            segmentation_model=segmentation_model,
+            segmentation_processor=segmentation_processor,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device("cuda")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
+        text: str,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
+                be masked out with `mask_image` and repainted according to `prompt`.
+            text (`str``):
+                The text to use to generate the mask.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # We use the input text to generate the mask
+        inputs = self.segmentation_processor(
+            text=[text], images=[image], padding="max_length", return_tensors="pt"
+        ).to(self.device)
+        outputs = self.segmentation_model(**inputs)
+        mask = torch.sigmoid(outputs.logits).cpu().detach().unsqueeze(-1).numpy()
+        mask_pil = self.numpy_to_pil(mask)[0].resize(image.size)
+
+        # Run inpainting pipeline with the generated mask
+        inpainting_pipeline = StableDiffusionInpaintPipeline(
+            vae=self.vae,
+            text_encoder=self.text_encoder,
+            tokenizer=self.tokenizer,
+            unet=self.unet,
+            scheduler=self.scheduler,
+            safety_checker=self.safety_checker,
+            feature_extractor=self.feature_extractor,
+        )
+        return inpainting_pipeline(
+            prompt=prompt,
+            image=image,
+            mask_image=mask_pil,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )

From 1138d63b519e37f0ce04e027b9f4a3261d27c628 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Wed, 16 Nov 2022 18:42:21 +0100
Subject: [PATCH 201/269] Temporary local test for PIL_INTERPOLATION (#1317)

* Temporary local test for PIL_INTERPOLATION

* Fix examples too.
---
 examples/community/imagic_stable_diffusion.py | 22 ++++++++++++++++++-
 examples/community/lpw_stable_diffusion.py    | 22 ++++++++++++++++++-
 .../community/lpw_stable_diffusion_onnx.py    | 21 +++++++++++++++++-
 .../textual_inversion/textual_inversion.py    | 22 ++++++++++++++++++-
 .../textual_inversion_flax.py                 | 22 ++++++++++++++++++-
 5 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py
index d6d89283b194..4aee953169cd 100644
--- a/examples/community/imagic_stable_diffusion.py
+++ b/examples/community/imagic_stable_diffusion.py
@@ -17,11 +17,31 @@
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import PIL_INTERPOLATION, logging
+from diffusers.utils import logging
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index 8c5f5b46a768..5f0c6b706f78 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -12,9 +12,29 @@
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import PIL_INTERPOLATION, deprecate, is_accelerate_available, logging
+from diffusers.utils import deprecate, is_accelerate_available, logging
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py
index 268af775a3cd..7c69951872e2 100644
--- a/examples/community/lpw_stable_diffusion_onnx.py
+++ b/examples/community/lpw_stable_diffusion_onnx.py
@@ -10,9 +10,28 @@
 from diffusers.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import PIL_INTERPOLATION, logging
+from diffusers.utils import logging
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index 532ce4a74139..a40caec2f2e8 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -18,13 +18,33 @@
 from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from diffusers.utils import PIL_INTERPOLATION
 from huggingface_hub import HfFolder, Repository, whoami
 from PIL import Image
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+import PIL
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
 
 logger = get_logger(__name__)
 
diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py
index 008fe812c92f..eaa0c6910d40 100644
--- a/examples/textual_inversion/textual_inversion_flax.py
+++ b/examples/textual_inversion/textual_inversion_flax.py
@@ -23,7 +23,6 @@
     FlaxUNet2DConditionModel,
 )
 from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker
-from diffusers.utils import PIL_INTERPOLATION
 from flax import jax_utils
 from flax.training import train_state
 from flax.training.common_utils import shard
@@ -34,6 +33,27 @@
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
 
 
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
+import PIL
+if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.Resampling.BILINEAR,
+        "bilinear": PIL.Image.Resampling.BILINEAR,
+        "bicubic": PIL.Image.Resampling.BICUBIC,
+        "lanczos": PIL.Image.Resampling.LANCZOS,
+        "nearest": PIL.Image.Resampling.NEAREST,
+    }
+else:
+    PIL_INTERPOLATION = {
+        "linear": PIL.Image.LINEAR,
+        "bilinear": PIL.Image.BILINEAR,
+        "bicubic": PIL.Image.BICUBIC,
+        "lanczos": PIL.Image.LANCZOS,
+        "nearest": PIL.Image.NEAREST,
+    }
+# ------------------------------------------------------------------------------
+
 logger = logging.getLogger(__name__)
 
 

From 245e9cc7fffed3e1830fcc74637c1581cb8f46b6 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 17 Nov 2022 15:03:31 +0100
Subject: [PATCH 202/269] fix make style

---
 examples/community/imagic_stable_diffusion.py        | 5 +++--
 examples/community/lpw_stable_diffusion.py           | 4 +++-
 examples/community/lpw_stable_diffusion_onnx.py      | 4 +++-
 examples/textual_inversion/textual_inversion.py      | 8 +++++---
 examples/textual_inversion/textual_inversion_flax.py | 7 ++++---
 5 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py
index 4aee953169cd..65966b4830e8 100644
--- a/examples/community/imagic_stable_diffusion.py
+++ b/examples/community/imagic_stable_diffusion.py
@@ -18,12 +18,13 @@
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from diffusers.utils import logging
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 
-# TODO: remove and import from diffusers.utils when the new version of diffusers is released
-from packaging import version
 if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
     PIL_INTERPOLATION = {
         "linear": PIL.Image.Resampling.BILINEAR,
diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index 5f0c6b706f78..b952ffe76df1 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -13,10 +13,12 @@
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from diffusers.utils import deprecate, is_accelerate_available, logging
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+
 if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
     PIL_INTERPOLATION = {
         "linear": PIL.Image.Resampling.BILINEAR,
diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py
index 7c69951872e2..577772b9c36a 100644
--- a/examples/community/lpw_stable_diffusion_onnx.py
+++ b/examples/community/lpw_stable_diffusion_onnx.py
@@ -11,10 +11,12 @@
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from diffusers.utils import logging
-from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
+from transformers import CLIPFeatureExtractor, CLIPTokenizer
+
+
 if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
     PIL_INTERPOLATION = {
         "linear": PIL.Image.Resampling.BILINEAR,
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index a40caec2f2e8..380ce90297fb 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -12,6 +12,7 @@
 import torch.utils.checkpoint
 from torch.utils.data import Dataset
 
+import PIL
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
@@ -19,14 +20,15 @@
 from diffusers.optimization import get_scheduler
 from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
 from huggingface_hub import HfFolder, Repository, whoami
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
 from PIL import Image
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
-# TODO: remove and import from diffusers.utils when the new version of diffusers is released
-from packaging import version
-import PIL
+
 if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
     PIL_INTERPOLATION = {
         "linear": PIL.Image.Resampling.BILINEAR,
diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py
index eaa0c6910d40..6406be8ad698 100644
--- a/examples/textual_inversion/textual_inversion_flax.py
+++ b/examples/textual_inversion/textual_inversion_flax.py
@@ -14,6 +14,7 @@
 import jax
 import jax.numpy as jnp
 import optax
+import PIL
 import transformers
 from diffusers import (
     FlaxAutoencoderKL,
@@ -27,15 +28,15 @@
 from flax.training import train_state
 from flax.training.common_utils import shard
 from huggingface_hub import HfFolder, Repository, whoami
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
+from packaging import version
 from PIL import Image
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel, set_seed
 
 
-# TODO: remove and import from diffusers.utils when the new version of diffusers is released
-from packaging import version
-import PIL
 if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
     PIL_INTERPOLATION = {
         "linear": PIL.Image.Resampling.BILINEAR,

From b3911f89a30a12ed34e993b090a748d4a8f886bb Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 17 Nov 2022 15:06:23 +0100
Subject: [PATCH 203/269] make fix copies

---
 .../pipelines/alt_diffusion/pipeline_alt_diffusion.py         | 4 ++--
 .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py    | 2 +-
 .../stable_diffusion/pipeline_stable_diffusion_img2img.py     | 2 +-
 .../stable_diffusion/pipeline_stable_diffusion_inpaint.py     | 2 +-
 .../pipeline_stable_diffusion_inpaint_legacy.py               | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index 01b2051db410..afb2c5288640 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -179,7 +179,7 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
-    def enable_sequential_cpu_offload(self):
+    def enable_sequential_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -190,7 +190,7 @@ def enable_sequential_cpu_offload(self):
         else:
             raise ImportError("Please install accelerate via `pip install accelerate`")
 
-        device = torch.device("cuda")
+        device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
             if cpu_offloaded_model is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index b5f409929234..f3fc3f8eb514 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -220,7 +220,7 @@ def enable_sequential_cpu_offload(self):
         else:
             raise ImportError("Please install accelerate via `pip install accelerate`")
 
-        device = torch.device("cuda")
+        device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
             if cpu_offloaded_model is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 4bfbc5fbcb6e..d9ec997b91f3 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -187,7 +187,7 @@ def enable_sequential_cpu_offload(self):
         else:
             raise ImportError("Please install accelerate via `pip install accelerate`")
 
-        device = torch.device("cuda")
+        device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
             if cpu_offloaded_model is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index fea2b3e5a8ee..aa087bb09ff2 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -180,7 +180,7 @@ def enable_sequential_cpu_offload(self):
         else:
             raise ImportError("Please install accelerate via `pip install accelerate`")
 
-        device = torch.device("cuda")
+        device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
             if cpu_offloaded_model is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 5c2a3e952375..2cc7c5c167e4 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -200,7 +200,7 @@ def enable_sequential_cpu_offload(self):
         else:
             raise ImportError("Please install accelerate via `pip install accelerate`")
 
-        device = torch.device("cuda")
+        device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
             if cpu_offloaded_model is not None:

From 61719bf26c3f09f2fd8483d83778f12e151d2ed4 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 17 Nov 2022 15:41:33 +0100
Subject: [PATCH 204/269] Fix gpu_id (#1326)

---
 .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py | 4 ++--
 .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py    | 2 +-
 .../stable_diffusion/pipeline_stable_diffusion_img2img.py     | 2 +-
 .../stable_diffusion/pipeline_stable_diffusion_inpaint.py     | 2 +-
 .../pipeline_stable_diffusion_inpaint_legacy.py               | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 294a43e86e61..fc530603ee32 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -178,7 +178,7 @@ def disable_attention_slicing(self):
         self.enable_attention_slicing(None)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.enable_sequential_cpu_offload
-    def enable_sequential_cpu_offload(self):
+    def enable_sequential_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
@@ -189,7 +189,7 @@ def enable_sequential_cpu_offload(self):
         else:
             raise ImportError("Please install accelerate via `pip install accelerate`")
 
-        device = torch.device("cuda")
+        device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
             if cpu_offloaded_model is not None:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index f3fc3f8eb514..2b3cf8fa95d5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -209,7 +209,7 @@ def disable_attention_slicing(self):
         self.enable_attention_slicing(None)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
-    def enable_sequential_cpu_offload(self):
+    def enable_sequential_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index d9ec997b91f3..f543d564fe84 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -176,7 +176,7 @@ def disable_attention_slicing(self):
         self.enable_attention_slicing(None)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
-    def enable_sequential_cpu_offload(self):
+    def enable_sequential_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index aa087bb09ff2..e85e23869987 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -169,7 +169,7 @@ def disable_attention_slicing(self):
         self.enable_attention_slicing(None)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
-    def enable_sequential_cpu_offload(self):
+    def enable_sequential_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 2cc7c5c167e4..77e903ff6807 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -189,7 +189,7 @@ def disable_attention_slicing(self):
         self.enable_attention_slicing(None)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
-    def enable_sequential_cpu_offload(self):
+    def enable_sequential_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a

From 3346ec3acd62363ddbd924b09601fbd3897473f9 Mon Sep 17 00:00:00 2001
From: Prathik Rao <prathikr@usc.edu>
Date: Thu, 17 Nov 2022 06:48:41 -0800
Subject: [PATCH 205/269] integrate ort (#1110)

* integrate ort

* use return_dict=False

* revert unet return value change

* revert unet return value change

* add note to readme

* adjust readme

* add contact

* `make style`

Co-authored-by: Prathik Rao <prathikrao@microsoft.com>
Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 .../unconditional_image_generation/README.md  |  21 ++
 .../train_unconditional_ort.py                | 251 ++++++++++++++++++
 src/diffusers/pipeline_utils.py               |   3 +
 3 files changed, 275 insertions(+)
 create mode 100644 examples/unconditional_image_generation/train_unconditional_ort.py

diff --git a/examples/unconditional_image_generation/README.md b/examples/unconditional_image_generation/README.md
index e9c461b482de..dbb849178982 100644
--- a/examples/unconditional_image_generation/README.md
+++ b/examples/unconditional_image_generation/README.md
@@ -127,3 +127,24 @@ dataset.push_to_hub("name_of_your_dataset", private=True)
 and that's it! You can now train your model by simply setting the `--dataset_name` argument to the name of your dataset on the hub.
 
 More on this can also be found in [this blog post](https://huggingface.co/blog/image-search-datasets).
+
+#### Use ONNXRuntime to accelerate training
+
+In order to leverage onnxruntime to accelerate training, please use train_unconditional_ort.py
+
+The command to train a DDPM UNet model on the Oxford Flowers dataset with onnxruntime:
+
+```bash
+accelerate launch train_unconditional_ort.py \
+  --dataset_name="huggan/flowers-102-categories" \
+  --resolution=64 \
+  --output_dir="ddpm-ema-flowers-64" \
+  --train_batch_size=16 \
+  --num_epochs=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-4 \
+  --lr_warmup_steps=500 \
+  --mixed_precision=fp16
+  ```
+
+Please contact Prathik Rao (prathikr), Sunghoon Choi (hanbitmyths), Ashwini Khade (askhade), or Peng Wang (pengwa) on github with any questions.
\ No newline at end of file
diff --git a/examples/unconditional_image_generation/train_unconditional_ort.py b/examples/unconditional_image_generation/train_unconditional_ort.py
new file mode 100644
index 000000000000..8259c835fc3a
--- /dev/null
+++ b/examples/unconditional_image_generation/train_unconditional_ort.py
@@ -0,0 +1,251 @@
+import argparse
+import math
+import os
+
+import torch
+import torch.nn.functional as F
+
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from datasets import load_dataset
+from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
+from diffusers.hub_utils import init_git_repo, push_to_hub
+from diffusers.optimization import get_scheduler
+from diffusers.training_utils import EMAModel
+from onnxruntime.training.ortmodule import ORTModule
+from torchvision.transforms import (
+    CenterCrop,
+    Compose,
+    InterpolationMode,
+    Normalize,
+    RandomHorizontalFlip,
+    Resize,
+    ToTensor,
+)
+from tqdm.auto import tqdm
+
+
+logger = get_logger(__name__)
+
+
+def main(args):
+    logging_dir = os.path.join(args.output_dir, args.logging_dir)
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        mixed_precision=args.mixed_precision,
+        log_with="tensorboard",
+        logging_dir=logging_dir,
+    )
+
+    model = UNet2DModel(
+        sample_size=args.resolution,
+        in_channels=3,
+        out_channels=3,
+        layers_per_block=2,
+        block_out_channels=(128, 128, 256, 256, 512, 512),
+        down_block_types=(
+            "DownBlock2D",
+            "DownBlock2D",
+            "DownBlock2D",
+            "DownBlock2D",
+            "AttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        up_block_types=(
+            "UpBlock2D",
+            "AttnUpBlock2D",
+            "UpBlock2D",
+            "UpBlock2D",
+            "UpBlock2D",
+            "UpBlock2D",
+        ),
+    )
+    model = ORTModule(model)
+    noise_scheduler = DDPMScheduler(num_train_timesteps=1000, tensor_format="pt")
+    optimizer = torch.optim.AdamW(
+        model.parameters(),
+        lr=args.learning_rate,
+        betas=(args.adam_beta1, args.adam_beta2),
+        weight_decay=args.adam_weight_decay,
+        eps=args.adam_epsilon,
+    )
+
+    augmentations = Compose(
+        [
+            Resize(args.resolution, interpolation=InterpolationMode.BILINEAR),
+            CenterCrop(args.resolution),
+            RandomHorizontalFlip(),
+            ToTensor(),
+            Normalize([0.5], [0.5]),
+        ]
+    )
+
+    if args.dataset_name is not None:
+        dataset = load_dataset(
+            args.dataset_name,
+            args.dataset_config_name,
+            cache_dir=args.cache_dir,
+            use_auth_token=True if args.use_auth_token else None,
+            split="train",
+        )
+    else:
+        dataset = load_dataset("imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, split="train")
+
+    def transforms(examples):
+        images = [augmentations(image.convert("RGB")) for image in examples["image"]]
+        return {"input": images}
+
+    dataset.set_transform(transforms)
+    train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.train_batch_size, shuffle=True)
+
+    lr_scheduler = get_scheduler(
+        args.lr_scheduler,
+        optimizer=optimizer,
+        num_warmup_steps=args.lr_warmup_steps,
+        num_training_steps=(len(train_dataloader) * args.num_epochs) // args.gradient_accumulation_steps,
+    )
+
+    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        model, optimizer, train_dataloader, lr_scheduler
+    )
+
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+
+    ema_model = EMAModel(model, inv_gamma=args.ema_inv_gamma, power=args.ema_power, max_value=args.ema_max_decay)
+
+    if args.push_to_hub:
+        repo = init_git_repo(args, at_init=True)
+
+    if accelerator.is_main_process:
+        run = os.path.split(__file__)[-1].split(".")[0]
+        accelerator.init_trackers(run)
+
+    global_step = 0
+    for epoch in range(args.num_epochs):
+        model.train()
+        progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not accelerator.is_local_main_process)
+        progress_bar.set_description(f"Epoch {epoch}")
+        for step, batch in enumerate(train_dataloader):
+            clean_images = batch["input"]
+            # Sample noise that we'll add to the images
+            noise = torch.randn(clean_images.shape).to(clean_images.device)
+            bsz = clean_images.shape[0]
+            # Sample a random timestep for each image
+            timesteps = torch.randint(
+                0, noise_scheduler.config.num_train_timesteps, (bsz,), device=clean_images.device
+            ).long()
+
+            # Add noise to the clean images according to the noise magnitude at each timestep
+            # (this is the forward diffusion process)
+            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)
+
+            with accelerator.accumulate(model):
+                # Predict the noise residual
+                noise_pred = model(noisy_images, timesteps, return_dict=True)[0]
+                loss = F.mse_loss(noise_pred, noise)
+                accelerator.backward(loss)
+
+                accelerator.clip_grad_norm_(model.parameters(), 1.0)
+                optimizer.step()
+                lr_scheduler.step()
+                if args.use_ema:
+                    ema_model.step(model)
+                optimizer.zero_grad()
+
+            # Checks if the accelerator has performed an optimization step behind the scenes
+            if accelerator.sync_gradients:
+                progress_bar.update(1)
+                global_step += 1
+
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
+            if args.use_ema:
+                logs["ema_decay"] = ema_model.decay
+            progress_bar.set_postfix(**logs)
+            accelerator.log(logs, step=global_step)
+        progress_bar.close()
+
+        accelerator.wait_for_everyone()
+
+        # Generate sample images for visual inspection
+        if accelerator.is_main_process:
+            if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1:
+                pipeline = DDPMPipeline(
+                    unet=accelerator.unwrap_model(ema_model.averaged_model if args.use_ema else model),
+                    scheduler=noise_scheduler,
+                )
+
+                generator = torch.manual_seed(0)
+                # run pipeline in inference (sample random noise and denoise)
+                images = pipeline(generator=generator, batch_size=args.eval_batch_size, output_type="numpy").images
+
+                # denormalize the images and save to tensorboard
+                images_processed = (images * 255).round().astype("uint8")
+                accelerator.trackers[0].writer.add_images(
+                    "test_samples", images_processed.transpose(0, 3, 1, 2), epoch
+                )
+
+            if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
+                # save the model
+                if args.push_to_hub:
+                    push_to_hub(args, pipeline, repo, commit_message=f"Epoch {epoch}", blocking=False)
+                else:
+                    pipeline.save_pretrained(args.output_dir)
+        accelerator.wait_for_everyone()
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument("--local_rank", type=int, default=-1)
+    parser.add_argument("--dataset_name", type=str, default=None)
+    parser.add_argument("--dataset_config_name", type=str, default=None)
+    parser.add_argument("--train_data_dir", type=str, default=None, help="A folder containing the training data.")
+    parser.add_argument("--output_dir", type=str, default="ddpm-model-64")
+    parser.add_argument("--overwrite_output_dir", action="store_true")
+    parser.add_argument("--cache_dir", type=str, default=None)
+    parser.add_argument("--resolution", type=int, default=64)
+    parser.add_argument("--train_batch_size", type=int, default=16)
+    parser.add_argument("--eval_batch_size", type=int, default=16)
+    parser.add_argument("--num_epochs", type=int, default=100)
+    parser.add_argument("--save_images_epochs", type=int, default=10)
+    parser.add_argument("--save_model_epochs", type=int, default=10)
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
+    parser.add_argument("--learning_rate", type=float, default=1e-4)
+    parser.add_argument("--lr_scheduler", type=str, default="cosine")
+    parser.add_argument("--lr_warmup_steps", type=int, default=500)
+    parser.add_argument("--adam_beta1", type=float, default=0.95)
+    parser.add_argument("--adam_beta2", type=float, default=0.999)
+    parser.add_argument("--adam_weight_decay", type=float, default=1e-6)
+    parser.add_argument("--adam_epsilon", type=float, default=1e-08)
+    parser.add_argument("--use_ema", action="store_true", default=True)
+    parser.add_argument("--ema_inv_gamma", type=float, default=1.0)
+    parser.add_argument("--ema_power", type=float, default=3 / 4)
+    parser.add_argument("--ema_max_decay", type=float, default=0.9999)
+    parser.add_argument("--push_to_hub", action="store_true")
+    parser.add_argument("--use_auth_token", action="store_true")
+    parser.add_argument("--hub_token", type=str, default=None)
+    parser.add_argument("--hub_model_id", type=str, default=None)
+    parser.add_argument("--hub_private_repo", action="store_true")
+    parser.add_argument("--logging_dir", type=str, default="logs")
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose"
+            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
+            "and an Nvidia Ampere GPU."
+        ),
+    )
+
+    args = parser.parse_args()
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    if args.dataset_name is None and args.train_data_dir is None:
+        raise ValueError("You must specify either a dataset name from the hub or a train data directory.")
+
+    main(args)
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 4ab1695683a7..2b7834c207cf 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -78,6 +78,9 @@
         "ProcessorMixin": ["save_pretrained", "from_pretrained"],
         "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
     },
+    "onnxruntime.training": {
+        "ORTModule": ["save_pretrained", "from_pretrained"],
+    },
 }
 
 ALL_IMPORTABLE_CLASSES = {}

From 2dd12e38afffeeab8f49033e06a4cb6438d6eb49 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 17 Nov 2022 15:50:33 +0100
Subject: [PATCH 206/269] make fix copies again

---
 .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 294a43e86e61..4f5c1c354626 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -189,7 +189,7 @@ def enable_sequential_cpu_offload(self):
         else:
             raise ImportError("Please install accelerate via `pip install accelerate`")
 
-        device = torch.device("cuda")
+        device = torch.device(f"cuda:{gpu_id}")
 
         for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
             if cpu_offloaded_model is not None:

From 632dacea2fb686e6dfbea6b74ee0256ed3401cc1 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 17 Nov 2022 16:00:26 +0100
Subject: [PATCH 207/269] [Custom pipeline] Easier loading of local pipelines
 (#1327)

* [Custom pipeline] Easier loading of local pipelines

* upgrade black
---
 src/diffusers/pipeline_utils.py             |  11 ++-
 tests/fixtures/custom_pipeline/what_ever.py | 101 ++++++++++++++++++++
 tests/test_pipelines.py                     |  16 +++-
 3 files changed, 126 insertions(+), 2 deletions(-)
 create mode 100644 tests/fixtures/custom_pipeline/what_ever.py

diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 2b7834c207cf..65432f47f927 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -18,6 +18,7 @@
 import inspect
 import os
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
@@ -483,8 +484,16 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         # 2. Load the pipeline class, if using custom module then load it from the hub
         # if we load from explicit class, let's use it
         if custom_pipeline is not None:
+            if custom_pipeline.endswith(".py"):
+                path = Path(custom_pipeline)
+                # decompose into folder & file
+                file_name = path.name
+                custom_pipeline = path.parent.absolute()
+            else:
+                file_name = CUSTOM_PIPELINE_FILE_NAME
+
             pipeline_class = get_class_from_dynamic_module(
-                custom_pipeline, module_file=CUSTOM_PIPELINE_FILE_NAME, cache_dir=custom_pipeline
+                custom_pipeline, module_file=file_name, cache_dir=custom_pipeline
             )
         elif cls != DiffusionPipeline:
             pipeline_class = cls
diff --git a/tests/fixtures/custom_pipeline/what_ever.py b/tests/fixtures/custom_pipeline/what_ever.py
new file mode 100644
index 000000000000..e7429d0a1945
--- /dev/null
+++ b/tests/fixtures/custom_pipeline/what_ever.py
@@ -0,0 +1,101 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+import torch
+
+from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+
+
+class CustomLocalPipeline(DiffusionPipeline):
+    r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of
+            [`DDPMScheduler`], or [`DDIMScheduler`].
+    """
+
+    def __init__(self, unet, scheduler):
+        super().__init__()
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        batch_size: int = 1,
+        generator: Optional[torch.Generator] = None,
+        num_inference_steps: int = 50,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[ImagePipelineOutput, Tuple]:
+        r"""
+        Args:
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images to generate.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            eta (`float`, *optional*, defaults to 0.0):
+                The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM).
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+
+        # Sample gaussian noise to begin loop
+        image = torch.randn(
+            (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size),
+            generator=generator,
+        )
+        image = image.to(self.device)
+
+        # set step values
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # 1. predict noise model_output
+            model_output = self.unet(image, t).sample
+
+            # 2. predict previous mean of image x_t-1 and add variance depending on eta
+            # eta corresponds to η in paper and should be between [0, 1]
+            # do x_t -> x_t-1
+            image = self.scheduler.step(model_output, t, image).prev_sample
+
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).numpy()
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,), "This is a local test"
+
+        return ImagePipelineOutput(images=image), "This is a local test"
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index c77b000292a5..b593791c9455 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -192,7 +192,7 @@ def test_run_custom_pipeline(self):
         # compare output to https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py#L102
         assert output_str == "This is a test"
 
-    def test_local_custom_pipeline(self):
+    def test_local_custom_pipeline_repo(self):
         local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline")
         pipeline = DiffusionPipeline.from_pretrained(
             "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path
@@ -205,6 +205,20 @@ def test_local_custom_pipeline(self):
         # compare to https://github.com/huggingface/diffusers/blob/main/tests/fixtures/custom_pipeline/pipeline.py#L102
         assert output_str == "This is a local test"
 
+    def test_local_custom_pipeline_file(self):
+        local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline")
+        local_custom_pipeline_path = os.path.join(local_custom_pipeline_path, "what_ever.py")
+        pipeline = DiffusionPipeline.from_pretrained(
+            "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path
+        )
+        pipeline = pipeline.to(torch_device)
+        images, output_str = pipeline(num_inference_steps=2, output_type="np")
+
+        assert pipeline.__class__.__name__ == "CustomLocalPipeline"
+        assert images[0].shape == (1, 32, 32, 3)
+        # compare to https://github.com/huggingface/diffusers/blob/main/tests/fixtures/custom_pipeline/pipeline.py#L102
+        assert output_str == "This is a local test"
+
     @slow
     @require_torch_gpu
     def test_load_pipeline_from_git(self):

From e05ca84f4134c240e580e6964ad9b3fa9591a316 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 17 Nov 2022 16:37:35 +0100
Subject: [PATCH 208/269] [ONNX] Support Euler schedulers (#1328)

---
 .../stable_diffusion/pipeline_onnx_stable_diffusion.py      | 6 ++++--
 .../pipeline_onnx_stable_diffusion_img2img.py               | 6 ++++--
 .../pipeline_onnx_stable_diffusion_inpaint.py               | 6 ++++--
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index eceefea87426..9830ace6a167 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -261,8 +261,10 @@ def __call__(
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
             # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, torch.from_numpy(latents), **extra_step_kwargs).prev_sample
-            latents = np.array(latents)
+            scheduler_output = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            )
+            latents = scheduler_output.prev_sample.numpy()
 
             # call the callback, if provided
             if callback is not None and i % callback_steps == 0:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index 8b4f78c49787..1fc4786e4792 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -401,8 +401,10 @@ def __call__(
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
             # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, torch.from_numpy(latents), **extra_step_kwargs).prev_sample
-            latents = latents.numpy()
+            scheduler_output = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            )
+            latents = scheduler_output.prev_sample.numpy()
 
             # call the callback, if provided
             if callback is not None and i % callback_steps == 0:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 6228824b3d24..ede30b556334 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -424,8 +424,10 @@ def __call__(
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
             # compute the previous noisy sample x_t -> x_t-1
-            latents = self.scheduler.step(noise_pred, t, torch.from_numpy(latents), **extra_step_kwargs).prev_sample
-            latents = latents.numpy()
+            scheduler_output = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            )
+            latents = scheduler_output.prev_sample.numpy()
 
             # call the callback, if provided
             if callback is not None and i % callback_steps == 0:

From 63b34191b99bebd65fda69eb32d4d9a8872b9e75 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 17 Nov 2022 16:47:19 +0100
Subject: [PATCH 209/269] Fix typo

---
 docs/source/api/pipelines/alt_diffusion.mdx    | 6 +++---
 docs/source/api/pipelines/stable_diffusion.mdx | 8 ++++----
 src/diffusers/pipeline_utils.py                | 6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/source/api/pipelines/alt_diffusion.mdx b/docs/source/api/pipelines/alt_diffusion.mdx
index efa9beb8c072..84dda88dcb34 100644
--- a/docs/source/api/pipelines/alt_diffusion.mdx
+++ b/docs/source/api/pipelines/alt_diffusion.mdx
@@ -61,10 +61,10 @@ If you want to use all possible use cases in a single `DiffusionPipeline` we rec
 ...     AltDiffusionImg2ImgPipeline,
 ... )
 
->>> img2text = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion")
->>> img2img = AltDiffusionImg2ImgPipeline(**img2text.components)
+>>> text2img = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion")
+>>> img2img = AltDiffusionImg2ImgPipeline(**text2img.components)
 
->>> # now you can use img2text(...) and img2img(...) just like the call methods of each respective pipeline
+>>> # now you can use text2img(...) and img2img(...) just like the call methods of each respective pipeline
 ```
 
 ## AltDiffusionPipelineOutput
diff --git a/docs/source/api/pipelines/stable_diffusion.mdx b/docs/source/api/pipelines/stable_diffusion.mdx
index 1d22024a5322..8b551f7a3b17 100644
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -61,11 +61,11 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca
 ...     StableDiffusionInpaintPipeline,
 ... )
 
->>> img2text = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
->>> img2img = StableDiffusionImg2ImgPipeline(**img2text.components)
->>> inpaint = StableDiffusionInpaintPipeline(**img2text.components)
+>>> text2img = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+>>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
+>>> inpaint = StableDiffusionInpaintPipeline(**text2img.components)
 
->>> # now you can use img2text(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
+>>> # now you can use text2img(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
 ```
 
 ## StableDiffusionPipelineOutput
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 65432f47f927..cf2bbb980ed0 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -680,9 +680,9 @@ def components(self) -> Dict[str, Any]:
         ...     StableDiffusionInpaintPipeline,
         ... )
 
-        >>> img2text = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
-        >>> img2img = StableDiffusionImg2ImgPipeline(**img2text.components)
-        >>> inpaint = StableDiffusionInpaintPipeline(**img2text.components)
+        >>> text2img = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
+        >>> inpaint = StableDiffusionInpaintPipeline(**text2img.components)
         ```
 
         Returns:

From b9b7039f0e326f57be233cdcbcf4cda325100649 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 17 Nov 2022 16:48:15 +0100
Subject: [PATCH 210/269] img2text Typo (#1329)

* make fix copies again

* Fix typo
---
 docs/source/api/pipelines/alt_diffusion.mdx    | 6 +++---
 docs/source/api/pipelines/stable_diffusion.mdx | 8 ++++----
 src/diffusers/pipeline_utils.py                | 6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/source/api/pipelines/alt_diffusion.mdx b/docs/source/api/pipelines/alt_diffusion.mdx
index efa9beb8c072..84dda88dcb34 100644
--- a/docs/source/api/pipelines/alt_diffusion.mdx
+++ b/docs/source/api/pipelines/alt_diffusion.mdx
@@ -61,10 +61,10 @@ If you want to use all possible use cases in a single `DiffusionPipeline` we rec
 ...     AltDiffusionImg2ImgPipeline,
 ... )
 
->>> img2text = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion")
->>> img2img = AltDiffusionImg2ImgPipeline(**img2text.components)
+>>> text2img = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion")
+>>> img2img = AltDiffusionImg2ImgPipeline(**text2img.components)
 
->>> # now you can use img2text(...) and img2img(...) just like the call methods of each respective pipeline
+>>> # now you can use text2img(...) and img2img(...) just like the call methods of each respective pipeline
 ```
 
 ## AltDiffusionPipelineOutput
diff --git a/docs/source/api/pipelines/stable_diffusion.mdx b/docs/source/api/pipelines/stable_diffusion.mdx
index 1d22024a5322..8b551f7a3b17 100644
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -61,11 +61,11 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca
 ...     StableDiffusionInpaintPipeline,
 ... )
 
->>> img2text = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
->>> img2img = StableDiffusionImg2ImgPipeline(**img2text.components)
->>> inpaint = StableDiffusionInpaintPipeline(**img2text.components)
+>>> text2img = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+>>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
+>>> inpaint = StableDiffusionInpaintPipeline(**text2img.components)
 
->>> # now you can use img2text(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
+>>> # now you can use text2img(...), img2img(...), inpaint(...) just like the call methods of each respective pipeline
 ```
 
 ## StableDiffusionPipelineOutput
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 65432f47f927..cf2bbb980ed0 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -680,9 +680,9 @@ def components(self) -> Dict[str, Any]:
         ...     StableDiffusionInpaintPipeline,
         ... )
 
-        >>> img2text = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
-        >>> img2img = StableDiffusionImg2ImgPipeline(**img2text.components)
-        >>> inpaint = StableDiffusionInpaintPipeline(**img2text.components)
+        >>> text2img = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
+        >>> inpaint = StableDiffusionInpaintPipeline(**text2img.components)
         ```
 
         Returns:

From 0cfbb51b0c36f541697e5ef83296bda874ac0671 Mon Sep 17 00:00:00 2001
From: Nathan Lambert <nathan@huggingface.co>
Date: Thu, 17 Nov 2022 10:25:49 -0800
Subject: [PATCH 211/269] add docs for multi-modal examples (#1227)

* add docs for multi-modal

* many changes

* fix docs build

* fix links

* Update docs/source/using-diffusers/other-modalities.mdx

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 README.md                                     |  9 +++++++--
 docs/source/_toctree.yml                      | 12 +++++++++++
 docs/source/api/experimental/rl.mdx           | 15 ++++++++++++++
 docs/source/using-diffusers/audio.mdx         | 16 +++++++++++++++
 .../using-diffusers/other-modalities.mdx      | 20 +++++++++++++++++++
 docs/source/using-diffusers/rl.mdx            | 18 +++++++++++++++++
 6 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 docs/source/api/experimental/rl.mdx
 create mode 100644 docs/source/using-diffusers/audio.mdx
 create mode 100644 docs/source/using-diffusers/other-modalities.mdx
 create mode 100644 docs/source/using-diffusers/rl.mdx

diff --git a/README.md b/README.md
index 4a944d04590e..ff523d060c59 100644
--- a/README.md
+++ b/README.md
@@ -345,7 +345,8 @@ Textual Inversion is a technique for capturing novel concepts from a small numbe
 
 ## Stable Diffusion Community Pipelines
 
-The release of Stable Diffusion as an open source model has fostered a lot of interesting ideas and experimentation. Our [Community Examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community) contains many ideas worth exploring, like interpolating to create animated videos, using CLIP Guidance for additional prompt fidelity, term weighting, and much more! [Take a look](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview) and [contribute your own](https://huggingface.co/docs/diffusers/using-diffusers/contribute_pipeline).
+The release of Stable Diffusion as an open source model has fostered a lot of interesting ideas and experimentation. 
+Our [Community Examples folder](https://github.com/huggingface/diffusers/tree/main/examples/community) contains many ideas worth exploring, like interpolating to create animated videos, using CLIP Guidance for additional prompt fidelity, term weighting, and much more! [Take a look](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview) and [contribute your own](https://huggingface.co/docs/diffusers/using-diffusers/contribute_pipeline).
 
 ## Other Examples
 
@@ -394,10 +395,14 @@ image.save("ddpm_generated_image.png")
 - [Unconditional Latent Diffusion](https://huggingface.co/CompVis/ldm-celebahq-256)
 - [Unconditional Diffusion with continuous scheduler](https://huggingface.co/google/ncsnpp-ffhq-1024)
 
-**Other Notebooks**:
+**Other Image Notebooks**:
 * [image-to-image generation with Stable Diffusion](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg),
 * [tweak images via repeated Stable Diffusion seeds](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg),
 
+**Diffusers for Other Modalities**:
+* [Molecule conformation generation](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg),
+* [Model-based reinforcement learning](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg),
+
 ### Web Demos
 If you just want to play around with some web demos, you can try out the following 🚀 Spaces:
 | Model                          	| Hugging Face Spaces                                                                                                                                               	|
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 4491a1eab657..c143dab9f5b7 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -31,6 +31,14 @@
     - local: using-diffusers/contribute_pipeline
       title: "How to contribute a Pipeline"
     title: "Pipelines for Inference"
+  - sections:
+    - local: using-diffusers/rl
+      title: "Reinforcement Learning"
+    - local: using-diffusers/audio
+      title: "Audio"
+    - local: using-diffusers/other-modalities
+      title: "Other Modalities"
+    title: "Taking Diffusers Beyond Images"
   title: "Using Diffusers"
 - sections:
   - local: optimization/fp16
@@ -107,4 +115,8 @@
     - local: api/pipelines/repaint
       title: "RePaint"
     title: "Pipelines"
+  - sections:
+    - local: api/experimental/rl
+      title: "RL Planning"
+    title: "Experimental Features"
   title: "API"
diff --git a/docs/source/api/experimental/rl.mdx b/docs/source/api/experimental/rl.mdx
new file mode 100644
index 000000000000..65abb06e7523
--- /dev/null
+++ b/docs/source/api/experimental/rl.mdx
@@ -0,0 +1,15 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# TODO
+
+Coming soon!
\ No newline at end of file
diff --git a/docs/source/using-diffusers/audio.mdx b/docs/source/using-diffusers/audio.mdx
new file mode 100644
index 000000000000..5a5c2241ca75
--- /dev/null
+++ b/docs/source/using-diffusers/audio.mdx
@@ -0,0 +1,16 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Using Diffusers for audio
+
+The [`DanceDiffusionPipeline`] can be used to generate audio rapidly!
+More coming soon!
\ No newline at end of file
diff --git a/docs/source/using-diffusers/other-modalities.mdx b/docs/source/using-diffusers/other-modalities.mdx
new file mode 100644
index 000000000000..1dc0877adb24
--- /dev/null
+++ b/docs/source/using-diffusers/other-modalities.mdx
@@ -0,0 +1,20 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Using Diffusers with other modalities
+
+Diffusers is in the process of expanding to modalities other than images.
+
+Currently, one example is for [molecule conformation](https://www.nature.com/subjects/molecular-conformation#:~:text=Definition,to%20changes%20in%20their%20environment.) generation.
+* Generate conformations in Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/geodiff_molecule_conformation.ipynb)
+
+More coming soon!
\ No newline at end of file
diff --git a/docs/source/using-diffusers/rl.mdx b/docs/source/using-diffusers/rl.mdx
new file mode 100644
index 000000000000..6e18e07001b6
--- /dev/null
+++ b/docs/source/using-diffusers/rl.mdx
@@ -0,0 +1,18 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Using Diffusers for reinforcement learning
+
+Support for one RL model and related pipelines is included in the `experimental` source of diffusers.
+
+To try some of this in colab, please look at the following example:
+* Model-based reinforcement learning on Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb) ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)

From 5dcef138bf9addd53a9abc80c5436ea948bb22d0 Mon Sep 17 00:00:00 2001
From: Simon Kirsten <1972314+skirsten@users.noreply.github.com>
Date: Fri, 18 Nov 2022 10:31:07 +0000
Subject: [PATCH 212/269] [Flax] Fix loading scheduler from subfolder (#1319)

[FLAX] Fix loading scheduler from subfolder
---
 src/diffusers/schedulers/scheduling_utils_flax.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_utils_flax.py b/src/diffusers/schedulers/scheduling_utils_flax.py
index b3024ca450d0..5dc28c25d9d6 100644
--- a/src/diffusers/schedulers/scheduling_utils_flax.py
+++ b/src/diffusers/schedulers/scheduling_utils_flax.py
@@ -118,7 +118,10 @@ def from_pretrained(
 
         """
         config, kwargs = cls.load_config(
-            pretrained_model_name_or_path=pretrained_model_name_or_path, return_unused_kwargs=True, **kwargs
+            pretrained_model_name_or_path=pretrained_model_name_or_path,
+            subfolder=subfolder,
+            return_unused_kwargs=True,
+            **kwargs,
         )
         scheduler, unused_kwargs = cls.from_config(config, return_unused_kwargs=True, **kwargs)
 

From fcfdd95f0b40e52c1b50f5b23eafb2423f1664ba Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 18 Nov 2022 12:32:17 +0100
Subject: [PATCH 213/269] Fix/Enable all schedulers for in-painting (#1331)

* inpaint fix k lms

* onnox as well

* up
---
 .../pipeline_onnx_stable_diffusion_inpaint.py |  2 +-
 .../pipeline_stable_diffusion_inpaint.py      |  3 +-
 .../test_stable_diffusion_inpaint.py          | 40 +++++++++++++++++++
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index ede30b556334..c353217d7562 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -408,8 +408,8 @@ def __call__(
             # expand the latents if we are doing classifier free guidance
             latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
             # concat latents, mask, masked_image_latnets in the channel dimension
-            latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1)
             latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
+            latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1)
             latent_model_input = latent_model_input.cpu().numpy()
 
             # predict the noise residual
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index e85e23869987..992b4ca272c2 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -586,9 +586,8 @@ def __call__(
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
 
             # concat latents, mask, masked_image_latents in the channel dimension
-            latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
-
             latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
 
             # predict the noise residual
             noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index ce231a1a460b..c6a976f4f2f0 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -22,6 +22,7 @@
 
 from diffusers import (
     AutoencoderKL,
+    LMSDiscreteScheduler,
     PNDMScheduler,
     StableDiffusionInpaintPipeline,
     UNet2DConditionModel,
@@ -421,6 +422,45 @@ def test_stable_diffusion_inpaint_pipeline_pndm(self):
         assert image.shape == (512, 512, 3)
         assert np.abs(expected_image - image).max() < 1e-2
 
+    def test_stable_diffusion_inpaint_pipeline_k_lms(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/in_paint"
+            "/yellow_cat_sitting_on_a_park_bench_k_lms.npy"
+        )
+
+        model_id = "runwayml/stable-diffusion-inpainting"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None)
+        pipe.to(torch_device)
+
+        # switch to LMS
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2
+
     @unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
     def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         torch.cuda.empty_cache()

From 195e437ac511f169d36b033f01e0536ce7ea1267 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 18 Nov 2022 12:32:49 +0100
Subject: [PATCH 214/269] Correct path to schedlure (#1322)

* [Examples] Correct path

* uP
---
 examples/dreambooth/train_dreambooth.py         | 2 +-
 examples/text_to_image/train_text_to_image.py   | 4 ++--
 examples/textual_inversion/textual_inversion.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 610c18533b0b..13d0eb6ce0ac 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -472,7 +472,7 @@ def main(args):
         eps=args.adam_epsilon,
     )
 
-    noise_scheduler = DDPMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+    noise_scheduler = DDPMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler")
 
     train_dataset = DreamBoothDataset(
         instance_data_root=args.instance_data_dir,
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index cf7dac89330c..d615abb4643c 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -372,7 +372,7 @@ def main():
         weight_decay=args.adam_weight_decay,
         eps=args.adam_epsilon,
     )
-    noise_scheduler = DDPMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+    noise_scheduler = DDPMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler")
 
     # Get the datasets: you can either provide your own training and evaluation files (see below)
     # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub).
@@ -605,7 +605,7 @@ def collate_fn(examples):
             vae=vae,
             unet=unet,
             tokenizer=tokenizer,
-            scheduler=PNDMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
+            scheduler=PNDMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler"),
             safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
             feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
         )
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index 380ce90297fb..7d9fb7c0f175 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -441,7 +441,7 @@ def main():
         eps=args.adam_epsilon,
     )
 
-    noise_scheduler = DDPMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+    noise_scheduler = DDPMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler")
 
     train_dataset = TextualInversionDataset(
         data_root=args.train_data_dir,
@@ -574,7 +574,7 @@ def main():
             vae=vae,
             unet=unet,
             tokenizer=tokenizer,
-            scheduler=PNDMScheduler.from_config("CompVis/stable-diffusion-v1-4", subfolder="scheduler"),
+            scheduler=PNDMScheduler.from_config(args.pretrained_model_name_or_path, subfolder="scheduler"),
             safety_checker=StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker"),
             feature_extractor=CLIPFeatureExtractor.from_pretrained("openai/clip-vit-base-patch32"),
         )

From 81fa2d688d6a04e834ecf9b12606f95d4308c3ea Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Fri, 18 Nov 2022 15:33:57 +0100
Subject: [PATCH 215/269] Avoid nested fix-copies (#1332)

* Avoid nested `# Copied from` statements during `make fix-copies`

* style
---
 .../alt_diffusion/pipeline_alt_diffusion_img2img.py   | 11 -----------
 utils/check_copies.py                                 |  4 ++++
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index fc530603ee32..9a50ef4a4e9c 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -81,7 +81,6 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.__init__
     def __init__(
         self,
         vae: AutoencoderKL,
@@ -148,7 +147,6 @@ def __init__(
             feature_extractor=feature_extractor,
         )
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.enable_attention_slicing
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
         Enable sliced attention computation.
@@ -168,7 +166,6 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
             slice_size = self.unet.config.attention_head_dim // 2
         self.unet.set_attention_slice(slice_size)
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.disable_attention_slicing
     def disable_attention_slicing(self):
         r"""
         Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
@@ -177,7 +174,6 @@ def disable_attention_slicing(self):
         # set slice_size = `None` to disable `attention slicing`
         self.enable_attention_slicing(None)
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.enable_sequential_cpu_offload
     def enable_sequential_cpu_offload(self, gpu_id=0):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
@@ -196,7 +192,6 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
                 cpu_offload(cpu_offloaded_model, device)
 
     @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline._execution_device
     def _execution_device(self):
         r"""
         Returns the device on which the pipeline's models will be executed. After calling
@@ -214,7 +209,6 @@ def _execution_device(self):
                 return torch.device(module._hf_hook.execution_device)
         return self.device
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.enable_xformers_memory_efficient_attention
     def enable_xformers_memory_efficient_attention(self):
         r"""
         Enable memory efficient attention as implemented in xformers.
@@ -227,14 +221,12 @@ def enable_xformers_memory_efficient_attention(self):
         """
         self.unet.set_use_memory_efficient_attention_xformers(True)
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.disable_xformers_memory_efficient_attention
     def disable_xformers_memory_efficient_attention(self):
         r"""
         Disable memory efficient attention as implemented in xformers.
         """
         self.unet.set_use_memory_efficient_attention_xformers(False)
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline._encode_prompt
     def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -340,7 +332,6 @@ def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_fr
 
         return text_embeddings
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.run_safety_checker
     def run_safety_checker(self, image, device, dtype):
         if self.safety_checker is not None:
             safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
@@ -351,7 +342,6 @@ def run_safety_checker(self, image, device, dtype):
             has_nsfw_concept = None
         return image, has_nsfw_concept
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.decode_latents
     def decode_latents(self, latents):
         latents = 1 / 0.18215 * latents
         image = self.vae.decode(latents).sample
@@ -360,7 +350,6 @@ def decode_latents(self, latents):
         image = image.cpu().permute(0, 2, 3, 1).float().numpy()
         return image
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.AltDiffusionPipeline.prepare_extra_step_kwargs
     def prepare_extra_step_kwargs(self, generator, eta):
         # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
         # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
diff --git a/utils/check_copies.py b/utils/check_copies.py
index 395cefb9c48b..16782397da74 100644
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -153,6 +153,10 @@ def is_copy_consistent(filename, overwrite=False):
         observed_code_lines = lines[start_index:line_index]
         observed_code = "".join(observed_code_lines)
 
+        # Remove any nested `Copied from` comments to avoid circular copies
+        theoretical_code = [line for line in theoretical_code.split("\n") if _re_copy_warning.search(line) is None]
+        theoretical_code = "\n".join(theoretical_code)
+
         # Before comparing, use the `replace_pattern` on the original code.
         if len(replace_pattern) > 0:
             patterns = replace_pattern.replace("with", "").split(",")

From aa2ce41b99a7990a8eb03bf2bf9253a40909b31e Mon Sep 17 00:00:00 2001
From: NotNANtoN <antonwiehe@gmail.com>
Date: Fri, 18 Nov 2022 16:01:57 +0100
Subject: [PATCH 216/269] Fix img2img speed with LMS-Discrete Scheduler (#896)

Casting `self.sigmas` into a different dtype (the one of original_samples) is not advisable. In my img2img pipeline this leads to a long running time in the  `integrate.quad` call later on- by long I mean more than 10x slower.

Co-authored-by: Anton Lozhkov <anton@huggingface.co>
---
 src/diffusers/schedulers/scheduling_lms_discrete.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py
index 8a9aedb41bf1..cc9e8d72566a 100644
--- a/src/diffusers/schedulers/scheduling_lms_discrete.py
+++ b/src/diffusers/schedulers/scheduling_lms_discrete.py
@@ -243,19 +243,18 @@ def add_noise(
         timesteps: torch.FloatTensor,
     ) -> torch.FloatTensor:
         # Make sure sigmas and timesteps have the same device and dtype as original_samples
-        self.sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
+        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
         if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
             # mps does not support float64
-            self.timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
+            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
             timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
         else:
-            self.timesteps = self.timesteps.to(original_samples.device)
+            schedule_timesteps = self.timesteps.to(original_samples.device)
             timesteps = timesteps.to(original_samples.device)
 
-        schedule_timesteps = self.timesteps
         step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
 
-        sigma = self.sigmas[step_indices].flatten()
+        sigma = sigmas[step_indices].flatten()
         while len(sigma.shape) < len(original_samples.shape):
             sigma = sigma.unsqueeze(-1)
 

From 72403181794604b4818acbfb2fdb3c8365a9d6ea Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Fri, 18 Nov 2022 16:30:07 +0100
Subject: [PATCH 217/269] Fix the order of casts for onnx inpainting (#1338)

---
 .../stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index c353217d7562..b933c52bf6c4 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -409,8 +409,8 @@ def __call__(
             latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
             # concat latents, mask, masked_image_latnets in the channel dimension
             latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t)
-            latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1)
             latent_model_input = latent_model_input.cpu().numpy()
+            latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1)
 
             # predict the noise residual
             timestep = np.array([t], dtype=timestep_dtype)

From 30220905c4319e46e114cf7dc8047d94eca226f7 Mon Sep 17 00:00:00 2001
From: Clayton Sims <csims@dimagi.com>
Date: Fri, 18 Nov 2022 10:33:12 -0500
Subject: [PATCH 218/269] Legacy Inpainting Pipeline for Onnx Models (#1237)

* Add legacy inpainting pipeline compatibility for onnx

* remove commented out line

* Add onnx legacy inpainting test

* Fix slow decorators

* pep8 styling

* isort styling

* dummy object

* ordering consistency

* style

* docstring styles

* Refactor common prompt encoding pattern

* Update tests to permanent repository home

* support all available schedulers until ONNX IO binding is available

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* updated styling from PR suggested feedback

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
---
 src/diffusers/__init__.py                     |   1 +
 src/diffusers/pipelines/__init__.py           |   1 +
 .../pipelines/stable_diffusion/__init__.py    |   1 +
 ...ne_onnx_stable_diffusion_inpaint_legacy.py | 447 ++++++++++++++++++
 ...torch_and_transformers_and_onnx_objects.py |  15 +
 ...st_onnx_stable_diffusion_inpaint_legacy.py |  95 ++++
 6 files changed, 560 insertions(+)
 create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
 create mode 100644 tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 42cb2cb585c8..2ab62153634e 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -82,6 +82,7 @@
     from .pipelines import (
         OnnxStableDiffusionImg2ImgPipeline,
         OnnxStableDiffusionInpaintPipeline,
+        OnnxStableDiffusionInpaintPipelineLegacy,
         OnnxStableDiffusionPipeline,
         StableDiffusionOnnxPipeline,
     )
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 3ca66b28b5f1..7fc030dffb56 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -30,6 +30,7 @@
     from .stable_diffusion import (
         OnnxStableDiffusionImg2ImgPipeline,
         OnnxStableDiffusionInpaintPipeline,
+        OnnxStableDiffusionInpaintPipelineLegacy,
         OnnxStableDiffusionPipeline,
         StableDiffusionOnnxPipeline,
     )
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 6623929f8648..fe813b07cc5b 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -39,6 +39,7 @@ class StableDiffusionPipelineOutput(BaseOutput):
     from .pipeline_onnx_stable_diffusion import OnnxStableDiffusionPipeline, StableDiffusionOnnxPipeline
     from .pipeline_onnx_stable_diffusion_img2img import OnnxStableDiffusionImg2ImgPipeline
     from .pipeline_onnx_stable_diffusion_inpaint import OnnxStableDiffusionInpaintPipeline
+    from .pipeline_onnx_stable_diffusion_inpaint_legacy import OnnxStableDiffusionInpaintPipelineLegacy
 
 if is_transformers_available() and is_flax_available():
     import flax
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
new file mode 100644
index 000000000000..34f1d0e95dc0
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
@@ -0,0 +1,447 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from transformers import CLIPFeatureExtractor, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...onnx_utils import OnnxRuntimeModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import deprecate, logging
+from . import StableDiffusionPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def preprocess(image):
+    w, h = image.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    return 2.0 * image - 1.0
+
+
+def preprocess_mask(mask):
+    mask = mask.convert("L")
+    w, h = mask.size
+    w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
+    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = np.array(mask).astype(np.float32) / 255.0
+    mask = np.tile(mask, (4, 1, 1))
+    mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
+    mask = 1 - mask  # repaint white, keep black
+    return mask
+
+
+class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image inpainting using Stable Diffusion. This is a *legacy feature* for Onnx pipelines to
+    provide compatibility with StableDiffusionInpaintPipelineLegacy and may be removed in the future.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+    vae_encoder: OnnxRuntimeModel
+    vae_decoder: OnnxRuntimeModel
+    text_encoder: OnnxRuntimeModel
+    tokenizer: CLIPTokenizer
+    unet: OnnxRuntimeModel
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+    safety_checker: OnnxRuntimeModel
+    feature_extractor: CLIPFeatureExtractor
+
+    def __init__(
+        self,
+        vae_encoder: OnnxRuntimeModel,
+        vae_decoder: OnnxRuntimeModel,
+        text_encoder: OnnxRuntimeModel,
+        tokenizer: CLIPTokenizer,
+        unet: OnnxRuntimeModel,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        safety_checker: OnnxRuntimeModel,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae_encoder=vae_encoder,
+            vae_decoder=vae_decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
+    def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="np",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids
+
+        if not np.array_equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        text_embeddings = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0]
+        text_embeddings = np.repeat(text_embeddings, num_images_per_prompt, axis=0)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt] * batch_size
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="np",
+            )
+            uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0]
+            uncond_embeddings = np.repeat(uncond_embeddings, num_images_per_prompt, axis=0)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        init_image: Union[np.ndarray, PIL.Image.Image],
+        mask_image: Union[np.ndarray, PIL.Image.Image],
+        strength: float = 0.8,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: Optional[float] = 0.0,
+        generator: Optional[np.random.RandomState] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            init_image (`nd.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process. This is the image whose masked region will be inpainted.
+            mask_image (`nd.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
+                PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
+                contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.uu
+            strength (`float`, *optional*, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
+                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
+                noise will be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference. This parameter will be modulated by `strength`.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (?) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A np.random.RandomState to make generation deterministic.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if generator is None:
+            generator = np.random
+
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        if isinstance(init_image, PIL.Image.Image):
+            init_image = preprocess(init_image)
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        text_embeddings = self._encode_prompt(
+            prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        latents_dtype = text_embeddings.dtype
+        init_image = init_image.astype(latents_dtype)
+
+        # encode the init image into latents and scale the latents
+        init_latents = self.vae_encoder(sample=init_image)[0]
+        init_latents = 0.18215 * init_latents
+
+        # Expand init_latents for batch_size and num_images_per_prompt
+        init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0)
+        init_latents_orig = init_latents
+
+        # preprocess mask
+        if not isinstance(mask_image, np.ndarray):
+            mask_image = preprocess_mask(mask_image)
+        mask_image = mask_image.astype(latents_dtype)
+        mask = np.concatenate([mask_image] * num_images_per_prompt, axis=0)
+
+        # check sizes
+        if not mask.shape == init_latents.shape:
+            raise ValueError("The mask and init_image should be the same size!")
+
+        # get the original timestep using init_timestep
+        offset = self.scheduler.config.get("steps_offset", 0)
+        init_timestep = int(num_inference_steps * strength) + offset
+        init_timestep = min(init_timestep, num_inference_steps)
+
+        timesteps = self.scheduler.timesteps.numpy()[-init_timestep]
+        timesteps = np.array([timesteps] * batch_size * num_images_per_prompt)
+
+        # add noise to latents using the timesteps
+        noise = generator.randn(*init_latents.shape).astype(latents_dtype)
+        init_latents = self.scheduler.add_noise(
+            torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps)
+        )
+        init_latents = init_latents.numpy()
+
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (?) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to ? in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        latents = init_latents
+
+        t_start = max(num_inference_steps - init_timestep + offset, 0)
+        timesteps = self.scheduler.timesteps[t_start:].numpy()
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(
+                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
+            )[0]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(
+                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
+            ).prev_sample
+
+            latents = latents.numpy()
+
+            init_latents_proper = self.scheduler.add_noise(
+                torch.from_numpy(init_latents_orig), torch.from_numpy(noise), torch.from_numpy(np.array([t]))
+            )
+
+            init_latents_proper = init_latents_proper.numpy()
+
+            latents = (init_latents_proper * mask) + (latents * (1 - mask))
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        latents = 1 / 0.18215 * latents
+        # image = self.vae_decoder(latent_sample=latents)[0]
+        # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
+        image = np.concatenate(
+            [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
+        )
+
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = image.transpose((0, 2, 3, 1))
+
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(
+                self.numpy_to_pil(image), return_tensors="np"
+            ).pixel_values.astype(image.dtype)
+            # There will throw an error if use safety_checker batchsize>1
+            images, has_nsfw_concept = [], []
+            for i in range(image.shape[0]):
+                image_i, has_nsfw_concept_i = self.safety_checker(
+                    clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1]
+                )
+                images.append(image_i)
+                has_nsfw_concept.append(has_nsfw_concept_i[0])
+            image = np.concatenate(images)
+        else:
+            has_nsfw_concept = None
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
index 221020030ea1..ae9412a95682 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py
@@ -34,6 +34,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers", "onnx"])
 
 
+class OnnxStableDiffusionInpaintPipelineLegacy(metaclass=DummyObject):
+    _backends = ["torch", "transformers", "onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers", "onnx"])
+
+
 class OnnxStableDiffusionPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers", "onnx"]
 
diff --git a/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py
new file mode 100644
index 000000000000..577023f7055c
--- /dev/null
+++ b/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py
@@ -0,0 +1,95 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+from diffusers import OnnxStableDiffusionInpaintPipelineLegacy
+from diffusers.utils.testing_utils import (
+    is_onnx_available,
+    load_image,
+    load_numpy,
+    require_onnxruntime,
+    require_torch_gpu,
+    slow,
+)
+
+
+if is_onnx_available():
+    import onnxruntime as ort
+
+
+@slow
+@require_onnxruntime
+@require_torch_gpu
+class StableDiffusionOnnxInpaintLegacyPipelineIntegrationTests(unittest.TestCase):
+    @property
+    def gpu_provider(self):
+        return (
+            "CUDAExecutionProvider",
+            {
+                "gpu_mem_limit": "15000000000",  # 15GB
+                "arena_extend_strategy": "kSameAsRequested",
+            },
+        )
+
+    @property
+    def gpu_options(self):
+        options = ort.SessionOptions()
+        options.enable_mem_pattern = False
+        return options
+
+    def test_inference(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/in_paint/red_cat_sitting_on_a_park_bench_onnx.npy"
+        )
+
+        # using the PNDM scheduler by default
+        pipe = OnnxStableDiffusionInpaintPipelineLegacy.from_pretrained(
+            "CompVis/stable-diffusion-v1-4",
+            revision="onnx",
+            provider=self.gpu_provider,
+            sess_options=self.gpu_options,
+        )
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A red cat sitting on a park bench"
+
+        generator = np.random.RandomState(0)
+        output = pipe(
+            prompt=prompt,
+            init_image=init_image,
+            mask_image=mask_image,
+            strength=0.75,
+            guidance_scale=7.5,
+            num_inference_steps=15,
+            generator=generator,
+            output_type="np",
+        )
+
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-2

From 7bbbfbfd18ed9f5f6ce02bf194382a27150dd4c4 Mon Sep 17 00:00:00 2001
From: Juan Acevedo <juancevedo@gmail.com>
Date: Sat, 19 Nov 2022 11:51:52 -0800
Subject: [PATCH 219/269] Jax infer support negative prompt (#1337)

* support negative prompts in sd jax pipeline

* pass batched neg_prompt

* only encode when negative prompt is None

Co-authored-by: Juan Acevedo <jfacevedo@google.com>
---
 .../pipeline_flax_stable_diffusion.py         | 60 ++++++++++++++++---
 1 file changed, 52 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index 02943997d928..a2f0f73dbf1f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -165,6 +165,7 @@ def _generate(
         guidance_scale: float = 7.5,
         latents: Optional[jnp.array] = None,
         debug: bool = False,
+        neg_prompt_ids: jnp.array = None,
     ):
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
@@ -177,10 +178,14 @@ def _generate(
         batch_size = prompt_ids.shape[0]
 
         max_length = prompt_ids.shape[-1]
-        uncond_input = self.tokenizer(
-            [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
-        )
-        uncond_embeddings = self.text_encoder(uncond_input.input_ids, params=params["text_encoder"])[0]
+
+        if neg_prompt_ids is None:
+            uncond_input = self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
+            ).input_ids
+        else:
+            uncond_input = neg_prompt_ids
+        uncond_embeddings = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
         context = jnp.concatenate([uncond_embeddings, text_embeddings])
 
         latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
@@ -251,6 +256,7 @@ def __call__(
         return_dict: bool = True,
         jit: bool = False,
         debug: bool = False,
+        neg_prompt_ids: jnp.array = None,
         **kwargs,
     ):
         r"""
@@ -298,11 +304,30 @@ def __call__(
         """
         if jit:
             images = _p_generate(
-                self, prompt_ids, params, prng_seed, num_inference_steps, height, width, guidance_scale, latents, debug
+                self,
+                prompt_ids,
+                params,
+                prng_seed,
+                num_inference_steps,
+                height,
+                width,
+                guidance_scale,
+                latents,
+                debug,
+                neg_prompt_ids,
             )
         else:
             images = self._generate(
-                prompt_ids, params, prng_seed, num_inference_steps, height, width, guidance_scale, latents, debug
+                prompt_ids,
+                params,
+                prng_seed,
+                num_inference_steps,
+                height,
+                width,
+                guidance_scale,
+                latents,
+                debug,
+                neg_prompt_ids,
             )
 
         if self.safety_checker is not None:
@@ -333,10 +358,29 @@ def __call__(
 # TODO: maybe use a config dict instead of so many static argnums
 @partial(jax.pmap, static_broadcasted_argnums=(0, 4, 5, 6, 7, 9))
 def _p_generate(
-    pipe, prompt_ids, params, prng_seed, num_inference_steps, height, width, guidance_scale, latents, debug
+    pipe,
+    prompt_ids,
+    params,
+    prng_seed,
+    num_inference_steps,
+    height,
+    width,
+    guidance_scale,
+    latents,
+    debug,
+    neg_prompt_ids,
 ):
     return pipe._generate(
-        prompt_ids, params, prng_seed, num_inference_steps, height, width, guidance_scale, latents, debug
+        prompt_ids,
+        params,
+        prng_seed,
+        num_inference_steps,
+        height,
+        width,
+        guidance_scale,
+        latents,
+        debug,
+        neg_prompt_ids,
     )
 
 

From 44efcbda0a31c5582644176e32dbe8bfdbd330c9 Mon Sep 17 00:00:00 2001
From: Ki <73854284+ki-arie@users.noreply.github.com>
Date: Mon, 21 Nov 2022 06:56:57 +1300
Subject: [PATCH 220/269] Update README.md: IMAGIC example code snippet
 misspelling (#1346)

Update README.md

Minor spelling mistake.
---
 examples/community/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/community/README.md b/examples/community/README.md
index dc35d36a953d..6e0faf574699 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -370,7 +370,7 @@ def dummy(images, **kwargs):
 pipe.safety_checker = dummy
 
 images = []
-generator = th.Generator("cuda").manual_seed(0)
+generator = torch.Generator("cuda").manual_seed(0)
 
 seed = 0
 prompt = "a forest | a camel"
@@ -652,4 +652,4 @@ prompt = "a cup"  # the masked out region will be replaced with this
 
 with autocast("cuda"):
     image = pipe(image=image, text=text, prompt=prompt).images[0]
-```
\ No newline at end of file
+```

From eb2425b88c8c34682520d575723e8500dbf8d6b0 Mon Sep 17 00:00:00 2001
From: Ki <73854284+ki-arie@users.noreply.github.com>
Date: Mon, 21 Nov 2022 06:59:56 +1300
Subject: [PATCH 221/269] Update README.md: Minor change to Imagic code
 snippet, missing dir error (#1347)

Minor change to Imagic Readme

Missing dir causes an error when running the example code.
---
 examples/community/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/community/README.md b/examples/community/README.md
index 6e0faf574699..778e84325ded 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -399,6 +399,7 @@ import requests
 from PIL import Image
 from io import BytesIO
 import torch
+import os
 from diffusers import DiffusionPipeline, DDIMScheduler
 has_cuda = torch.cuda.is_available()
 device = torch.device('cpu' if not has_cuda else 'cuda')
@@ -423,6 +424,7 @@ res = pipe.train(
     num_inference_steps=50,
     generator=generator)
 res = pipe(alpha=1)
+os.makedirs("imagic", exist_ok=True)
 image = res.images[0]
 image.save('./imagic/imagic_image_alpha_1.png')
 res = pipe(alpha=1.5)

From 3bec90ff2c50df133ced407d6c451c22e5155342 Mon Sep 17 00:00:00 2001
From: Victor Schmidt <9283470+vict0rsch@users.noreply.github.com>
Date: Sun, 20 Nov 2022 13:33:09 -0500
Subject: [PATCH 222/269] Handle batches and Tensors in
 `pipeline_stable_diffusion_inpaint.py:prepare_mask_and_masked_image` (#1003)

* Handle batches and Tensors in `prepare_mask_and_masked_image`

* `blackfy`
upgrade `black`

* handle mask as `np.array`

* add docstring

* revert `black` changes with smaller line length

* missing ValueError in docstring

* raise `TypeError` for image as tensor but not mask

* typo in mask shape selection

* check for batch dim

* fix: wrong indentation

* add tests

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 .../pipeline_stable_diffusion_inpaint.py      |  97 +++++++++-
 .../test_stable_diffusion_inpaint.py          | 171 ++++++++++++++++++
 2 files changed, 258 insertions(+), 10 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 992b4ca272c2..f1a613aaea7c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -35,16 +35,93 @@
 
 
 def prepare_mask_and_masked_image(image, mask):
-    image = np.array(image.convert("RGB"))
-    image = image[None].transpose(0, 3, 1, 2)
-    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-
-    mask = np.array(mask.convert("L"))
-    mask = mask.astype(np.float32) / 255.0
-    mask = mask[None, None]
-    mask[mask < 0.5] = 0
-    mask[mask >= 0.5] = 1
-    mask = torch.from_numpy(mask)
+    """
+    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline.
+    This means that those inputs will be converted to ``torch.Tensor`` with
+    shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for
+    the ``image`` and ``1`` for the ``mask``.
+
+    The ``image`` will be converted to ``torch.float32`` and normalized to be in
+    ``[-1, 1]``. The ``mask`` will be binarized (``mask > 0.5``) and cast to
+    ``torch.float32`` too.
+
+    Args:
+        image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array``
+            or a ``channels x height x width`` ``torch.Tensor`` or a
+            ``batch x channels x height x width`` ``torch.Tensor``.
+        mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
+            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or
+            a ``1 x height x width`` ``torch.Tensor`` or a
+            ``batch x 1 x height x width`` ``torch.Tensor``.
+
+
+    Raises:
+        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range.
+        ValueError: ``torch.Tensor`` mask should be in the ``[0, 1]`` range.
+        ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+        TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
+            (ot the other way around).
+
+    Returns:
+        tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
+            dimensions: ``batch x channels x height x width``.
+    """
+    if isinstance(image, torch.Tensor):
+        if not isinstance(mask, torch.Tensor):
+            raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
+
+        # Batch single image
+        if image.ndim == 3:
+            assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)"
+            image = image.unsqueeze(0)
+
+        # Batch and add channel dim for single mask
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0).unsqueeze(0)
+
+        # Batch single mask or add channel dim
+        if mask.ndim == 3:
+            # Single batched mask, no channel dim or single mask not batched but channel dim
+            if mask.shape[0] == 1:
+                mask = mask.unsqueeze(0)
+
+            # Batched masks no channel dim
+            else:
+                mask = mask.unsqueeze(1)
+
+        assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions"
+        assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions"
+        assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size"
+
+        # Check image is in [-1, 1]
+        if image.min() < -1 or image.max() > 1:
+            raise ValueError("Image should be in [-1, 1] range")
+
+        # Check mask is in [0, 1]
+        if mask.min() < 0 or mask.max() > 1:
+            raise ValueError("Mask should be in [0, 1] range")
+
+        # Binarize mask
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+
+        # Image as float32
+        image = image.to(dtype=torch.float32)
+    elif isinstance(mask, torch.Tensor):
+        raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not")
+    else:
+        if isinstance(image, PIL.Image.Image):
+            image = np.array(image.convert("RGB"))
+        image = image[None].transpose(0, 3, 1, 2)
+        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+        if isinstance(mask, PIL.Image.Image):
+            mask = np.array(mask.convert("L"))
+            mask = mask.astype(np.float32) / 255.0
+        mask = mask[None, None]
+        mask[mask < 0.5] = 0
+        mask[mask >= 0.5] = 1
+        mask = torch.from_numpy(mask)
 
     masked_image = image * (mask < 0.5)
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index c6a976f4f2f0..5c46d909e67a 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -29,7 +29,10 @@
     UNet2DModel,
     VQModel,
 )
+
 from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image
+
 from diffusers.utils.testing_utils import require_torch_gpu
 from PIL import Image
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
@@ -506,3 +509,171 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         mem_bytes = torch.cuda.max_memory_allocated()
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9
+
+class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase):
+    def test_pil_inputs(self):
+        im = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
+        im = Image.fromarray(im)
+        mask = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
+        mask = Image.fromarray((mask * 255).astype(np.uint8))
+
+        t_mask, t_masked = prepare_mask_and_masked_image(im, mask)
+
+        self.assertTrue(isinstance(t_mask, torch.Tensor))
+        self.assertTrue(isinstance(t_masked, torch.Tensor))
+
+        self.assertEqual(t_mask.ndim, 4)
+        self.assertEqual(t_masked.ndim, 4)
+
+        self.assertEqual(t_mask.shape, (1, 1, 32, 32))
+        self.assertEqual(t_masked.shape, (1, 3, 32, 32))
+
+        self.assertTrue(t_mask.dtype == torch.float32)
+        self.assertTrue(t_masked.dtype == torch.float32)
+
+        self.assertTrue(t_mask.min() >= 0.0)
+        self.assertTrue(t_mask.max() <= 1.0)
+        self.assertTrue(t_masked.min() >= -1.0)
+        self.assertTrue(t_masked.min() <= 1.0)
+
+        self.assertTrue(t_mask.sum() > 0.0)
+
+    def test_np_inputs(self):
+        im_np = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
+        im_pil = Image.fromarray(im_np)
+        mask_np = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
+        mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8))
+
+        t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+        t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil)
+
+        self.assertTrue((t_mask_np == t_mask_pil).all())
+        self.assertTrue((t_masked_np == t_masked_pil).all())
+
+    def test_torch_3D_2D_inputs(self):
+        im_tensor = torch.randint(0, 255, (3, 32, 32), dtype=torch.uint8)
+        mask_tensor = torch.randint(0, 255, (32, 32), dtype=torch.uint8) > 127.5
+        im_np = im_tensor.numpy().transpose(1, 2, 0)
+        mask_np = mask_tensor.numpy()
+
+        t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
+        t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+
+        self.assertTrue((t_mask_tensor == t_mask_np).all())
+        self.assertTrue((t_masked_tensor == t_masked_np).all())
+
+    def test_torch_3D_3D_inputs(self):
+        im_tensor = torch.randint(0, 255, (3, 32, 32), dtype=torch.uint8)
+        mask_tensor = torch.randint(0, 255, (1, 32, 32), dtype=torch.uint8) > 127.5
+        im_np = im_tensor.numpy().transpose(1, 2, 0)
+        mask_np = mask_tensor.numpy()[0]
+
+        t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
+        t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+
+        self.assertTrue((t_mask_tensor == t_mask_np).all())
+        self.assertTrue((t_masked_tensor == t_masked_np).all())
+
+    def test_torch_4D_2D_inputs(self):
+        im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8)
+        mask_tensor = torch.randint(0, 255, (32, 32), dtype=torch.uint8) > 127.5
+        im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
+        mask_np = mask_tensor.numpy()
+
+        t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
+        t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+
+        self.assertTrue((t_mask_tensor == t_mask_np).all())
+        self.assertTrue((t_masked_tensor == t_masked_np).all())
+
+    def test_torch_4D_3D_inputs(self):
+        im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8)
+        mask_tensor = torch.randint(0, 255, (1, 32, 32), dtype=torch.uint8) > 127.5
+        im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
+        mask_np = mask_tensor.numpy()[0]
+
+        t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
+        t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+
+        self.assertTrue((t_mask_tensor == t_mask_np).all())
+        self.assertTrue((t_masked_tensor == t_masked_np).all())
+
+    def test_torch_4D_4D_inputs(self):
+        im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8)
+        mask_tensor = torch.randint(0, 255, (1, 1, 32, 32), dtype=torch.uint8) > 127.5
+        im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
+        mask_np = mask_tensor.numpy()[0][0]
+
+        t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
+        t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
+
+        self.assertTrue((t_mask_tensor == t_mask_np).all())
+        self.assertTrue((t_masked_tensor == t_masked_np).all())
+
+    def test_torch_batch_4D_3D(self):
+        im_tensor = torch.randint(0, 255, (2, 3, 32, 32), dtype=torch.uint8)
+        mask_tensor = torch.randint(0, 255, (2, 32, 32), dtype=torch.uint8) > 127.5
+
+        im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
+        mask_nps = [mask.numpy() for mask in mask_tensor]
+
+        t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
+        nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)]
+        t_mask_np = torch.cat([n[0] for n in nps])
+        t_masked_np = torch.cat([n[1] for n in nps])
+
+        self.assertTrue((t_mask_tensor == t_mask_np).all())
+        self.assertTrue((t_masked_tensor == t_masked_np).all())
+
+    def test_torch_batch_4D_4D(self):
+        im_tensor = torch.randint(0, 255, (2, 3, 32, 32), dtype=torch.uint8)
+        mask_tensor = torch.randint(0, 255, (2, 1, 32, 32), dtype=torch.uint8) > 127.5
+
+        im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
+        mask_nps = [mask.numpy()[0] for mask in mask_tensor]
+
+        t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
+        nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)]
+        t_mask_np = torch.cat([n[0] for n in nps])
+        t_masked_np = torch.cat([n[1] for n in nps])
+
+        self.assertTrue((t_mask_tensor == t_mask_np).all())
+        self.assertTrue((t_masked_tensor == t_masked_np).all())
+
+    def test_shape_mismatch(self):
+        # test height and width
+        with self.assertRaises(AssertionError):
+            prepare_mask_and_masked_image(torch.randn(3, 32, 32), torch.randn(64, 64))
+        # test batch dim
+        with self.assertRaises(AssertionError):
+            prepare_mask_and_masked_image(torch.randn(2, 3, 32, 32), torch.randn(4, 64, 64))
+        # test batch dim
+        with self.assertRaises(AssertionError):
+            prepare_mask_and_masked_image(torch.randn(2, 3, 32, 32), torch.randn(4, 1, 64, 64))
+
+    def test_type_mismatch(self):
+        # test tensors-only
+        with self.assertRaises(TypeError):
+            prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.rand(3, 32, 32).numpy())
+        # test tensors-only
+        with self.assertRaises(TypeError):
+            prepare_mask_and_masked_image(torch.rand(3, 32, 32).numpy(), torch.rand(3, 32, 32))
+
+    def test_channels_first(self):
+        # test channels first for 3D tensors
+        with self.assertRaises(AssertionError):
+            prepare_mask_and_masked_image(torch.rand(32, 32, 3), torch.rand(3, 32, 32))
+
+    def test_tensor_range(self):
+        # test im <= 1
+        with self.assertRaises(ValueError):
+            prepare_mask_and_masked_image(torch.ones(3, 32, 32) * 2, torch.rand(32, 32))
+        # test im >= -1
+        with self.assertRaises(ValueError):
+            prepare_mask_and_masked_image(torch.ones(3, 32, 32) * (-2), torch.rand(32, 32))
+        # test mask <= 1
+        with self.assertRaises(ValueError):
+            prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * 2)
+        # test mask >= 0
+        with self.assertRaises(ValueError):
+            prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * -1)
\ No newline at end of file

From ab1f01e63415b63937736299d3a770554c83987e Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Sun, 20 Nov 2022 19:37:28 +0100
Subject: [PATCH 223/269] make style

---
 .../pipeline_stable_diffusion_inpaint.py      | 27 ++++++++-----------
 .../test_stable_diffusion_inpaint.py          |  7 +++--
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index f1a613aaea7c..2058d972fb87 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -36,30 +36,25 @@
 
 def prepare_mask_and_masked_image(image, mask):
     """
-    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline.
-    This means that those inputs will be converted to ``torch.Tensor`` with
-    shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for
-    the ``image`` and ``1`` for the ``mask``.
+    Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
+    converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
+    ``image`` and ``1`` for the ``mask``.
 
-    The ``image`` will be converted to ``torch.float32`` and normalized to be in
-    ``[-1, 1]``. The ``mask`` will be binarized (``mask > 0.5``) and cast to
-    ``torch.float32`` too.
+    The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be
+    binarized (``mask > 0.5``) and cast to ``torch.float32`` too.
 
     Args:
         image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint.
-            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array``
-            or a ``channels x height x width`` ``torch.Tensor`` or a
-            ``batch x channels x height x width`` ``torch.Tensor``.
+            It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width``
+            ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``.
         mask (_type_): The mask to apply to the image, i.e. regions to inpaint.
-            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or
-            a ``1 x height x width`` ``torch.Tensor`` or a
-            ``batch x 1 x height x width`` ``torch.Tensor``.
+            It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width``
+            ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``.
 
 
     Raises:
-        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range.
-        ValueError: ``torch.Tensor`` mask should be in the ``[0, 1]`` range.
-        ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
+        ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask
+        should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions.
         TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not
             (ot the other way around).
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 5c46d909e67a..2f9348c5b566 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -29,10 +29,8 @@
     UNet2DModel,
     VQModel,
 )
-
-from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image
-
+from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import require_torch_gpu
 from PIL import Image
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
@@ -510,6 +508,7 @@ def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
         # make sure that less than 2.2 GB is allocated
         assert mem_bytes < 2.2 * 10**9
 
+
 class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase):
     def test_pil_inputs(self):
         im = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
@@ -676,4 +675,4 @@ def test_tensor_range(self):
             prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * 2)
         # test mask >= 0
         with self.assertRaises(ValueError):
-            prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * -1)
\ No newline at end of file
+            prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * -1)

From 94b27fb8da1002a560449f4b8c0fc92e22115c40 Mon Sep 17 00:00:00 2001
From: shunxing1234 <33774367+shunxing1234@users.noreply.github.com>
Date: Mon, 21 Nov 2022 18:28:25 +0800
Subject: [PATCH 224/269] change the sample model (#1352)

* Update alt_diffusion.mdx

* Update alt_diffusion.mdx
---
 docs/source/api/pipelines/alt_diffusion.mdx | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/source/api/pipelines/alt_diffusion.mdx b/docs/source/api/pipelines/alt_diffusion.mdx
index 84dda88dcb34..4a75bc09bfa2 100644
--- a/docs/source/api/pipelines/alt_diffusion.mdx
+++ b/docs/source/api/pipelines/alt_diffusion.mdx
@@ -32,7 +32,7 @@ The abstract of the paper is the following:
 
 - *Run AltDiffusion*
 
-AltDiffusion can be tested very easily with the [`AltDiffusionPipeline`], [`AltDiffusionImg2ImgPipeline`] and the `"BAAI/AltDiffusion"` checkpoint exactly in the same way it is shown in the [Conditional Image Generation Guide](./using-diffusers/conditional_image_generation) and the [Image-to-Image Generation Guide](./using-diffusers/img2img).
+AltDiffusion can be tested very easily with the [`AltDiffusionPipeline`], [`AltDiffusionImg2ImgPipeline`] and the `"BAAI/AltDiffusion-m9"` checkpoint exactly in the same way it is shown in the [Conditional Image Generation Guide](./using-diffusers/conditional_image_generation) and the [Image-to-Image Generation Guide](./using-diffusers/img2img).
 
 - *How to load and use different schedulers.*
 
@@ -42,12 +42,12 @@ To use a different scheduler, you can either change it via the [`ConfigMixin.fro
 ```python
 >>> from diffusers import AltDiffusionPipeline, EulerDiscreteScheduler
 
->>> pipeline = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion")
+>>> pipeline = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9")
 >>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
 
 >>> # or
->>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler")
->>> pipeline = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=euler_scheduler)
+>>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("BAAI/AltDiffusion-m9", subfolder="scheduler")
+>>> pipeline = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9", scheduler=euler_scheduler)
 ```
 
 
@@ -61,7 +61,7 @@ If you want to use all possible use cases in a single `DiffusionPipeline` we rec
 ...     AltDiffusionImg2ImgPipeline,
 ... )
 
->>> text2img = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion")
+>>> text2img = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9")
 >>> img2img = AltDiffusionImg2ImgPipeline(**text2img.components)
 
 >>> # now you can use text2img(...) and img2img(...) just like the call methods of each respective pipeline

From 78a6eed2d7b082d8116c23061b57621cf909320f Mon Sep 17 00:00:00 2001
From: Stuti R <71293255+kingstut@users.noreply.github.com>
Date: Mon, 21 Nov 2022 04:50:32 -0600
Subject: [PATCH 225/269] Add bit diffusion [WIP] (#971)

* Create bit_diffusion.py

Bit diffusion based on the paper, arXiv:2208.04202, Chen2022AnalogBG

* adding bit diffusion to new branch

ran tests

* tests

* tests

* tests

* tests

* removed test folders + added to README

* Update README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 examples/community/README.md        |  11 +-
 examples/community/bit_diffusion.py | 263 ++++++++++++++++++++++++++++
 2 files changed, 273 insertions(+), 1 deletion(-)
 create mode 100644 examples/community/bit_diffusion.py

diff --git a/examples/community/README.md b/examples/community/README.md
index 778e84325ded..b3063f109da1 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -21,6 +21,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Multilingual Stable Diffusion| Stable Diffusion Pipeline that supports prompts in 50 different languages.                                                                                                                                                                                                                                                                                                     | [Multilingual Stable Diffusion](#multilingual-stable-diffusion-pipeline)                                                                 | -                                                                                                                                                                                                                  |                        [Juan Carlos Piñeros](https://github.com/juancopi81) |
 | Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting| [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Alex McKinney](https://github.com/vvvm23) |
 | Text Based Inpainting Stable Diffusion | Stable Diffusion Inpainting Pipeline that enables passing a text prompt to generate the mask for inpainting| [Text Based Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Dhruv Karan](https://github.com/unography) |
+| Bit Diffusion | Diffusion on discrete data | [Bit Diffusion](#bit-diffusion) | -  |[Stuti R.](https://github.com/kingstut) |
 
 
 
@@ -343,7 +344,6 @@ out = pipe(
 )
 ```
 
-
 ### Composable Stable diffusion 
 
 [Composable Stable Diffusion](https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/) proposes conjunction and negation (negative prompts) operators for compositional generation with conditional diffusion models.
@@ -655,3 +655,12 @@ prompt = "a cup"  # the masked out region will be replaced with this
 with autocast("cuda"):
     image = pipe(image=image, text=text, prompt=prompt).images[0]
 ```
+
+### Bit Diffusion 
+Based https://arxiv.org/abs/2208.04202, this is used for diffusion on discrete data - eg, discreate image data, DNA sequence data. An unconditional discreate image can be generated like this: 
+
+```python
+from diffusers import DiffusionPipeline
+pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="bit_diffusion")
+image = pipe().images[0]
+```
diff --git a/examples/community/bit_diffusion.py b/examples/community/bit_diffusion.py
new file mode 100644
index 000000000000..c0be3a13ad8d
--- /dev/null
+++ b/examples/community/bit_diffusion.py
@@ -0,0 +1,263 @@
+from typing import Optional, Tuple, Union
+
+import torch
+
+from diffusers import DDIMScheduler, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
+from diffusers.pipeline_utils import ImagePipelineOutput
+from diffusers.schedulers.scheduling_ddim import DDIMSchedulerOutput
+from diffusers.schedulers.scheduling_ddpm import DDPMSchedulerOutput
+from einops import rearrange, reduce
+
+
+BITS = 8
+
+
+# convert to bit representations and back taken from https://github.com/lucidrains/bit-diffusion/blob/main/bit_diffusion/bit_diffusion.py
+def decimal_to_bits(x, bits=BITS):
+    """expects image tensor ranging from 0 to 1, outputs bit tensor ranging from -1 to 1"""
+    device = x.device
+
+    x = (x * 255).int().clamp(0, 255)
+
+    mask = 2 ** torch.arange(bits - 1, -1, -1, device=device)
+    mask = rearrange(mask, "d -> d 1 1")
+    x = rearrange(x, "b c h w -> b c 1 h w")
+
+    bits = ((x & mask) != 0).float()
+    bits = rearrange(bits, "b c d h w -> b (c d) h w")
+    bits = bits * 2 - 1
+    return bits
+
+
+def bits_to_decimal(x, bits=BITS):
+    """expects bits from -1 to 1, outputs image tensor from 0 to 1"""
+    device = x.device
+
+    x = (x > 0).int()
+    mask = 2 ** torch.arange(bits - 1, -1, -1, device=device, dtype=torch.int32)
+
+    mask = rearrange(mask, "d -> d 1 1")
+    x = rearrange(x, "b (c d) h w -> b c d h w", d=8)
+    dec = reduce(x * mask, "b c d h w -> b c h w", "sum")
+    return (dec / 255).clamp(0.0, 1.0)
+
+
+# modified scheduler step functions for clamping the predicted x_0 between -bit_scale and +bit_scale
+def ddim_bit_scheduler_step(
+    self,
+    model_output: torch.FloatTensor,
+    timestep: int,
+    sample: torch.FloatTensor,
+    eta: float = 0.0,
+    use_clipped_model_output: bool = True,
+    generator=None,
+    return_dict: bool = True,
+) -> Union[DDIMSchedulerOutput, Tuple]:
+    """
+    Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+    process from the learned model outputs (most often the predicted noise).
+    Args:
+        model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+        timestep (`int`): current discrete timestep in the diffusion chain.
+        sample (`torch.FloatTensor`):
+            current instance of sample being created by diffusion process.
+        eta (`float`): weight of noise for added noise in diffusion step.
+        use_clipped_model_output (`bool`): TODO
+        generator: random number generator.
+        return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
+    Returns:
+        [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`:
+        [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+        returning a tuple, the first element is the sample tensor.
+    """
+    if self.num_inference_steps is None:
+        raise ValueError(
+            "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+        )
+
+    # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+    # Ideally, read DDIM paper in-detail understanding
+
+    # Notation (<variable name> -> <name in paper>
+    # - pred_noise_t -> e_theta(x_t, t)
+    # - pred_original_sample -> f_theta(x_t, t) or x_0
+    # - std_dev_t -> sigma_t
+    # - eta -> η
+    # - pred_sample_direction -> "direction pointing to x_t"
+    # - pred_prev_sample -> "x_t-1"
+
+    # 1. get previous step value (=t-1)
+    prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
+
+    # 2. compute alphas, betas
+    alpha_prod_t = self.alphas_cumprod[timestep]
+    alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
+
+    beta_prod_t = 1 - alpha_prod_t
+
+    # 3. compute predicted original sample from predicted noise also called
+    # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+
+    # 4. Clip "predicted x_0"
+    scale = self.bit_scale
+    if self.config.clip_sample:
+        pred_original_sample = torch.clamp(pred_original_sample, -scale, scale)
+
+    # 5. compute variance: "sigma_t(η)" -> see formula (16)
+    # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+    variance = self._get_variance(timestep, prev_timestep)
+    std_dev_t = eta * variance ** (0.5)
+
+    if use_clipped_model_output:
+        # the model_output is always re-derived from the clipped x_0 in Glide
+        model_output = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+
+    # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output
+
+    # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+    prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+
+    if eta > 0:
+        # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072
+        device = model_output.device if torch.is_tensor(model_output) else "cpu"
+        noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device)
+        variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * noise
+
+        prev_sample = prev_sample + variance
+
+    if not return_dict:
+        return (prev_sample,)
+
+    return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
+
+
+def ddpm_bit_scheduler_step(
+    self,
+    model_output: torch.FloatTensor,
+    timestep: int,
+    sample: torch.FloatTensor,
+    predict_epsilon=True,
+    generator=None,
+    return_dict: bool = True,
+) -> Union[DDPMSchedulerOutput, Tuple]:
+    """
+    Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+    process from the learned model outputs (most often the predicted noise).
+    Args:
+        model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+        timestep (`int`): current discrete timestep in the diffusion chain.
+        sample (`torch.FloatTensor`):
+            current instance of sample being created by diffusion process.
+        predict_epsilon (`bool`):
+            optional flag to use when model predicts the samples directly instead of the noise, epsilon.
+        generator: random number generator.
+        return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class
+    Returns:
+        [`~schedulers.scheduling_utils.DDPMSchedulerOutput`] or `tuple`:
+        [`~schedulers.scheduling_utils.DDPMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
+        returning a tuple, the first element is the sample tensor.
+    """
+    t = timestep
+
+    if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
+        model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
+    else:
+        predicted_variance = None
+
+    # 1. compute alphas, betas
+    alpha_prod_t = self.alphas_cumprod[t]
+    alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+    beta_prod_t = 1 - alpha_prod_t
+    beta_prod_t_prev = 1 - alpha_prod_t_prev
+
+    # 2. compute predicted original sample from predicted noise also called
+    # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
+    if predict_epsilon:
+        pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+    else:
+        pred_original_sample = model_output
+
+    # 3. Clip "predicted x_0"
+    scale = self.bit_scale
+    if self.config.clip_sample:
+        pred_original_sample = torch.clamp(pred_original_sample, -scale, scale)
+
+    # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
+    # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+    pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[t]) / beta_prod_t
+    current_sample_coeff = self.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t
+
+    # 5. Compute predicted previous sample µ_t
+    # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
+    pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
+
+    # 6. Add noise
+    variance = 0
+    if t > 0:
+        noise = torch.randn(
+            model_output.size(), dtype=model_output.dtype, layout=model_output.layout, generator=generator
+        ).to(model_output.device)
+        variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * noise
+
+    pred_prev_sample = pred_prev_sample + variance
+
+    if not return_dict:
+        return (pred_prev_sample,)
+
+    return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample)
+
+
+class BitDiffusion(DiffusionPipeline):
+    def __init__(
+        self,
+        unet: UNet2DConditionModel,
+        scheduler: Union[DDIMScheduler, DDPMScheduler],
+        bit_scale: Optional[float] = 1.0,
+    ):
+        super().__init__()
+        self.bit_scale = bit_scale
+        self.scheduler.step = (
+            ddim_bit_scheduler_step if isinstance(scheduler, DDIMScheduler) else ddpm_bit_scheduler_step
+        )
+
+        self.register_modules(unet=unet, scheduler=scheduler)
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        height: Optional[int] = 256,
+        width: Optional[int] = 256,
+        num_inference_steps: Optional[int] = 50,
+        generator: Optional[torch.Generator] = None,
+        batch_size: Optional[int] = 1,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        **kwargs,
+    ) -> Union[Tuple, ImagePipelineOutput]:
+        latents = torch.randn(
+            (batch_size, self.unet.in_channels, height, width),
+            generator=generator,
+        )
+        latents = decimal_to_bits(latents) * self.bit_scale
+        latents = latents.to(self.device)
+
+        self.scheduler.set_timesteps(num_inference_steps)
+
+        for t in self.progress_bar(self.scheduler.timesteps):
+            # predict the noise residual
+            noise_pred = self.unet(latents, t).sample
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
+
+        image = bits_to_decimal(latents)
+
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)

From ad935933452f40a3832cfab77a4b568ac0916885 Mon Sep 17 00:00:00 2001
From: Birch-san <Birch-san@users.noreply.github.com>
Date: Mon, 21 Nov 2022 14:01:11 +0000
Subject: [PATCH 226/269] perf: prefer batched matmuls for attention (#1203)

perf: prefer batched matmuls for attention. added fast-path to Decoder when num_heads=1
---
 src/diffusers/models/attention.py | 92 +++++++++++++++++++------------
 1 file changed, 57 insertions(+), 35 deletions(-)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index be9203b4d699..69522f76b09c 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -284,22 +284,52 @@ def forward(self, hidden_states):
         key_proj = self.key(hidden_states)
         value_proj = self.value(hidden_states)
 
-        # transpose
-        query_states = self.transpose_for_scores(query_proj)
-        key_states = self.transpose_for_scores(key_proj)
-        value_states = self.transpose_for_scores(value_proj)
+        scale = 1 / math.sqrt(self.channels / self.num_heads)
 
         # get scores
-        scale = 1 / math.sqrt(math.sqrt(self.channels / self.num_heads))
-        attention_scores = torch.matmul(query_states * scale, key_states.transpose(-1, -2) * scale)  # TODO: use baddmm
+        if self.num_heads > 1:
+            query_states = self.transpose_for_scores(query_proj)
+            key_states = self.transpose_for_scores(key_proj)
+            value_states = self.transpose_for_scores(value_proj)
+
+            # TODO: is there a way to perform batched matmul (e.g. baddbmm) on 4D tensors?
+            #       or reformulate this into a 3D problem?
+            # TODO: measure whether on MPS device it would be faster to do this matmul via einsum
+            #       as some matmuls can be 1.94x slower than an equivalent einsum on MPS
+            #       https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
+            attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * scale
+        else:
+            query_states, key_states, value_states = query_proj, key_proj, value_proj
+
+            attention_scores = torch.baddbmm(
+                torch.empty(
+                    query_states.shape[0],
+                    query_states.shape[1],
+                    key_states.shape[1],
+                    dtype=query_states.dtype,
+                    device=query_states.device,
+                ),
+                query_states,
+                key_states.transpose(-1, -2),
+                beta=0,
+                alpha=scale,
+            )
+
         attention_probs = torch.softmax(attention_scores.float(), dim=-1).type(attention_scores.dtype)
 
         # compute attention output
-        hidden_states = torch.matmul(attention_probs, value_states)
-
-        hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
-        new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
-        hidden_states = hidden_states.view(new_hidden_states_shape)
+        if self.num_heads > 1:
+            # TODO: is there a way to perform batched matmul (e.g. bmm) on 4D tensors?
+            #       or reformulate this into a 3D problem?
+            # TODO: measure whether on MPS device it would be faster to do this matmul via einsum
+            #       as some matmuls can be 1.94x slower than an equivalent einsum on MPS
+            #       https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
+            hidden_states = torch.matmul(attention_probs, value_states)
+            hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
+            new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
+            hidden_states = hidden_states.view(new_hidden_states_shape)
+        else:
+            hidden_states = torch.bmm(attention_probs, value_states)
 
         # compute next hidden_states
         hidden_states = self.proj_attn(hidden_states)
@@ -507,19 +537,17 @@ def forward(self, hidden_states, context=None, mask=None):
         return hidden_states
 
     def _attention(self, query, key, value):
-        # TODO: use baddbmm for better performance
-        if query.device.type == "mps":
-            # Better performance on mps (~20-25%)
-            attention_scores = torch.einsum("b i d, b j d -> b i j", query, key) * self.scale
-        else:
-            attention_scores = torch.matmul(query, key.transpose(-1, -2)) * self.scale
+        attention_scores = torch.baddbmm(
+            torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
+            query,
+            key.transpose(-1, -2),
+            beta=0,
+            alpha=self.scale,
+        )
         attention_probs = attention_scores.softmax(dim=-1)
         # compute attention output
 
-        if query.device.type == "mps":
-            hidden_states = torch.einsum("b i j, b j d -> b i d", attention_probs, value)
-        else:
-            hidden_states = torch.matmul(attention_probs, value)
+        hidden_states = torch.bmm(attention_probs, value)
 
         # reshape hidden_states
         hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
@@ -534,21 +562,15 @@ def _sliced_attention(self, query, key, value, sequence_length, dim):
         for i in range(hidden_states.shape[0] // slice_size):
             start_idx = i * slice_size
             end_idx = (i + 1) * slice_size
-            if query.device.type == "mps":
-                # Better performance on mps (~20-25%)
-                attn_slice = (
-                    torch.einsum("b i d, b j d -> b i j", query[start_idx:end_idx], key[start_idx:end_idx])
-                    * self.scale
-                )
-            else:
-                attn_slice = (
-                    torch.matmul(query[start_idx:end_idx], key[start_idx:end_idx].transpose(1, 2)) * self.scale
-                )  # TODO: use baddbmm for better performance
+            attn_slice = torch.baddbmm(
+                torch.empty(slice_size, query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
+                query[start_idx:end_idx],
+                key[start_idx:end_idx].transpose(-1, -2),
+                beta=0,
+                alpha=self.scale,
+            )
             attn_slice = attn_slice.softmax(dim=-1)
-            if query.device.type == "mps":
-                attn_slice = torch.einsum("b i j, b j d -> b i d", attn_slice, value[start_idx:end_idx])
-            else:
-                attn_slice = torch.matmul(attn_slice, value[start_idx:end_idx])
+            attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
 
             hidden_states[start_idx:end_idx] = attn_slice
 

From 182eb959e5efc8c77fa31394ca55376331c0ed25 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 21 Nov 2022 18:45:50 +0100
Subject: [PATCH 227/269] [Community Pipelines] K-Diffusion Pipeline (#1360)

* up

* add readme

* up

* uP
---
 examples/community/README.md                  |  62 +++
 examples/community/sd_text2img_k_diffusion.py | 479 ++++++++++++++++++
 2 files changed, 541 insertions(+)
 create mode 100755 examples/community/sd_text2img_k_diffusion.py

diff --git a/examples/community/README.md b/examples/community/README.md
index b3063f109da1..108f6f95f1b4 100644
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -22,6 +22,7 @@ If a community doesn't work as expected, please open an issue and ping the autho
 | Image to Image Inpainting Stable Diffusion | Stable Diffusion Pipeline that enables the overlaying of two images and subsequent inpainting| [Image to Image Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Alex McKinney](https://github.com/vvvm23) |
 | Text Based Inpainting Stable Diffusion | Stable Diffusion Inpainting Pipeline that enables passing a text prompt to generate the mask for inpainting| [Text Based Inpainting Stable Diffusion](#image-to-image-inpainting-stable-diffusion)                                                                 | -                                                                                                                                                                                                                  |                        [Dhruv Karan](https://github.com/unography) |
 | Bit Diffusion | Diffusion on discrete data | [Bit Diffusion](#bit-diffusion) | -  |[Stuti R.](https://github.com/kingstut) |
+| K-Diffusion Stable Diffusion | Run Stable Diffusion with any of [K-Diffusion's samplers](https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py) | [Stable Diffusion with K Diffusion](#stable-diffusion-with-k-diffusion) | -  | [Patrick von Platen](https://github.com/patrickvonplaten/) |
 
 
 
@@ -663,4 +664,65 @@ Based https://arxiv.org/abs/2208.04202, this is used for diffusion on discrete d
 from diffusers import DiffusionPipeline
 pipe = DiffusionPipeline.from_pretrained("google/ddpm-cifar10-32", custom_pipeline="bit_diffusion")
 image = pipe().images[0]
+
+```
+
+### Stable Diffusion with K Diffusion
+
+Make sure you have @crowsonkb's https://github.com/crowsonkb/k-diffusion installed:
+
 ```
+pip install k-diffusion
+```
+
+You can use the community pipeline as follows:
+
+```python
+from diffusers import DiffusionPipeline
+
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="sd_text2img_k_diffusion")
+pipe = pipe.to("cuda")
+
+prompt = "an astronaut riding a horse on mars"
+pipe.set_sampler("sample_heun")
+generator = torch.Generator(device="cuda").manual_seed(seed)
+image = pipe(prompt, generator=generator, num_inference_steps=20).images[0]
+
+image.save("./astronaut_heun_k_diffusion.png")
+```
+
+To make sure that K Diffusion and `diffusers` yield the same results:
+
+**Diffusers**:
+```python
+from diffusers import DiffusionPipeline, EulerDiscreteScheduler
+
+seed = 33
+
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+
+generator = torch.Generator(device="cuda").manual_seed(seed)
+image = pipe(prompt, generator=generator, num_inference_steps=50).images[0]
+```
+
+![diffusers_euler](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/k_diffusion/astronaut_euler.png)
+
+**K Diffusion**:
+```python
+from diffusers import DiffusionPipeline, EulerDiscreteScheduler
+
+seed = 33
+
+pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", custom_pipeline="sd_text2img_k_diffusion")
+pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+
+pipe.set_sampler("sample_euler")
+generator = torch.Generator(device="cuda").manual_seed(seed)
+image = pipe(prompt, generator=generator, num_inference_steps=50).images[0]
+```
+
+![diffusers_euler](https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/k_diffusion/astronaut_euler_k_diffusion.png)
+
diff --git a/examples/community/sd_text2img_k_diffusion.py b/examples/community/sd_text2img_k_diffusion.py
new file mode 100755
index 000000000000..e993c70de987
--- /dev/null
+++ b/examples/community/sd_text2img_k_diffusion.py
@@ -0,0 +1,479 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+from typing import Callable, List, Optional, Union
+
+import torch
+
+from diffusers import LMSDiscreteScheduler
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.utils import is_accelerate_available, logging
+from k_diffusion.external import CompVisDenoiser
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class ModelWrapper:
+    def __init__(self, model, alphas_cumprod):
+        self.model = model
+        self.alphas_cumprod = alphas_cumprod
+
+    def apply_model(self, *args, **kwargs):
+        return self.model(*args, **kwargs).sample
+
+
+class StableDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae,
+        text_encoder,
+        tokenizer,
+        unet,
+        scheduler,
+        safety_checker,
+        feature_extractor,
+    ):
+        super().__init__()
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        # get correct sigmas from LMS
+        scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+        model = ModelWrapper(unet, scheduler.alphas_cumprod)
+        self.k_diffusion_model = CompVisDenoiser(model)
+
+    def set_sampler(self, scheduler_type: str):
+        library = importlib.import_module("k_diffusion")
+        sampling = getattr(library, "sampling")
+        self.sampler = getattr(sampling, scheduler_type)
+
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = True
+        if guidance_scale <= 1.0:
+            raise ValueError("has to use guidance_scale")
+
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=text_embeddings.device)
+        sigmas = self.scheduler.sigmas
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+        latents = latents * sigmas[0]
+        self.k_diffusion_model.sigmas = self.k_diffusion_model.sigmas.to(latents.device)
+        self.k_diffusion_model.log_sigmas = self.k_diffusion_model.log_sigmas.to(latents.device)
+
+        def model_fn(x, t):
+            latent_model_input = torch.cat([x] * 2)
+
+            noise_pred = self.k_diffusion_model(latent_model_input, t, encoder_hidden_states=text_embeddings)
+
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            return noise_pred
+
+        latents = self.sampler(model_fn, latents, sigmas)
+
+        # 8. Post-processing
+        image = self.decode_latents(latents)
+
+        # 9. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
+
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

From e50c25d8081d483658245d792aca816e2fec49dd Mon Sep 17 00:00:00 2001
From: Manuel Brack <manuelbrack@gmx.de>
Date: Tue, 22 Nov 2022 11:51:30 +0100
Subject: [PATCH 228/269] Add Safe Stable Diffusion Pipeline (#1244)

* Add pipeline_stable_diffusion_safe.py to pipelines

* Fix repository consistency

Ran make fix-copies after adding new pipline

* Add Paper/Equation reference for parameters to doc string

* Ensure code style and quality

* Perform code refactoring

* Fix copies inherited from merge with huggingface/main

* Add docs

* Fix code style

* Fix errors in documentation

* Fix refactoring error

* remove debugging print statement

* added Safe Latent Diffusion tests

* Fix style

* Fix style

* Add pre-defined safety configurations

* Fix line-break

* fix some tests

* finish

* Change safety checker

* Add missing safety_checker.py file

* Remove unused imports

Co-authored-by: PatrickSchrML <patrick_schramowski@hotmail.de>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 docs/source/_toctree.yml                      |   2 +
 docs/source/api/pipelines/overview.mdx        |   3 +-
 .../api/pipelines/stable_diffusion_safe.mdx   |  90 +++
 docs/source/index.mdx                         |   1 +
 src/diffusers/__init__.py                     |   1 +
 src/diffusers/pipelines/__init__.py           |   1 +
 .../stable_diffusion_safe/__init__.py         |  72 ++
 .../pipeline_stable_diffusion_safe.py         | 721 ++++++++++++++++++
 .../stable_diffusion_safe/safety_checker.py   | 110 +++
 .../dummy_torch_and_transformers_objects.py   |  15 +
 .../stable_diffusion_safe/__init__.py         |   0
 .../test_safe_diffusion.py                    | 435 +++++++++++
 12 files changed, 1450 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/api/pipelines/stable_diffusion_safe.mdx
 create mode 100644 src/diffusers/pipelines/stable_diffusion_safe/__init__.py
 create mode 100644 src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
 create mode 100644 src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py
 create mode 100644 tests/pipelines/stable_diffusion_safe/__init__.py
 create mode 100644 tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index c143dab9f5b7..13687dc5419e 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -106,6 +106,8 @@
       title: "Score SDE VE"
     - local: api/pipelines/stable_diffusion
       title: "Stable Diffusion"
+    - local: api/pipelines/stable_diffusion_safe
+      title: "Safe Stable Diffusion"
     - local: api/pipelines/stochastic_karras_ve
       title: "Stochastic Karras VE"
     - local: api/pipelines/dance_diffusion
diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index 74c44fbccdf5..ff83bad05525 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -58,7 +58,8 @@ available a colab notebook to directly try them out.
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
-| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
+| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
+| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
 | [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 
 
 
diff --git a/docs/source/api/pipelines/stable_diffusion_safe.mdx b/docs/source/api/pipelines/stable_diffusion_safe.mdx
new file mode 100644
index 000000000000..81fc59d3928c
--- /dev/null
+++ b/docs/source/api/pipelines/stable_diffusion_safe.mdx
@@ -0,0 +1,90 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Safe Stable Diffusion
+
+Safe Stable Diffusion was proposed in [Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models](https://arxiv.org/abs/2211.05105) and mitigates the well known issue that models like Stable Diffusion that are trained on unfiltered, web-crawled datasets tend to suffer from inappropriate degeneration. For instance Stable Diffusion may unexpectedly generate nudity, violence, images depicting self-harm, or otherwise offensive content.
+Safe Stable Diffusion is an extension to the Stable Diffusion that drastically reduces content like this.
+
+The abstract of the paper is the following:
+
+*Text-conditioned image generation models have recently achieved astonishing results in image quality and text alignment and are consequently employed in a fast-growing number of applications. Since they are highly data-driven, relying on billion-sized datasets randomly scraped from the internet, they also suffer, as we demonstrate, from degenerated and biased human behavior. In turn, they may even reinforce such biases. To help combat these undesired side effects, we present safe latent diffusion (SLD). Specifically, to measure the inappropriate degeneration due to unfiltered and imbalanced training sets, we establish a novel image generation test bed-inappropriate image prompts (I2P)-containing dedicated, real-world image-to-text prompts covering concepts such as nudity and violence. As our exhaustive empirical evaluation demonstrates, the introduced SLD removes and suppresses inappropriate image parts during the diffusion process, with no additional training required and no adverse effect on overall image quality or text alignment.*
+
+
+*Overview*:
+
+| Pipeline | Tasks | Colab | Demo
+|---|---|:---:|:---:|
+| [pipeline_stable_diffusion_safe.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py) | *Text-to-Image Generation* |  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb) | -
+
+## Tips
+
+- Safe Stable Diffusion may also be used with weights of [Stable Diffusion](./api/pipelines/stable_diffusion).
+
+### Run Safe Stable Diffusion
+
+Safe Stable Diffusion can be tested very easily with the [`StableDiffusionPipelineSafe`], and the `"AIML-TUDA/stable-diffusion-safe"` checkpoint exactly in the same way it is shown in the [Conditional Image Generation Guide](./using-diffusers/conditional_image_generation).
+
+### Interacting with the Safety Concept
+
+To check and edit the currently used safety concept, use the `safety_concept` property of [`StableDiffusionPipelineSafe`]
+```python
+>>> from diffusers import StableDiffusionPipelineSafe
+
+>>> pipeline = StableDiffusionPipelineSafe.from_pretrained("AIML-TUDA/stable-diffusion-safe")
+>>> pipeline.safety_concept
+```
+For each image generation the active concept is also contained in [`StableDiffusionSafePipelineOutput`].
+
+### Using pre-defined safety configurations
+
+You may use the 4 configurations defined in the [Safe Latent Diffusion paper](https://arxiv.org/abs/2211.05105) as follows:
+
+```python
+>>> from diffusers import StableDiffusionPipelineSafe
+>>> from diffusers.pipelines.stable_diffusion_safe import SafetyConfig
+
+>>> pipeline = StableDiffusionPipelineSafe.from_pretrained("AIML-TUDA/stable-diffusion-safe")
+>>> prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. leyendecker"
+>>> out = pipeline(prompt=prompt, **SafetyConfig.MAX)
+```
+
+The following configurations are available: `SafetyConfig.WEAK`, `SafetyConfig.MEDIUM`, `SafetyConfig.STRONg`, and `SafetyConfig.MAX`.
+
+### How to load and use different schedulers.
+
+The safe stable diffusion pipeline uses [`PNDMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the stable diffusion pipeline such as [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
+To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] method or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
+
+```python
+>>> from diffusers import StableDiffusionPipelineSafe, EulerDiscreteScheduler
+
+>>> pipeline = StableDiffusionPipelineSafe.from_pretrained("AIML-TUDA/stable-diffusion-safe")
+>>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+>>> # or
+>>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("AIML-TUDA/stable-diffusion-safe", subfolder="scheduler")
+>>> pipeline = StableDiffusionPipelineSafe.from_pretrained(
+...     "AIML-TUDA/stable-diffusion-safe", scheduler=euler_scheduler
+... )
+```
+
+
+## StableDiffusionSafePipelineOutput
+[[autodoc]] pipelines.stable_diffusion_safe.StableDiffusionSafePipelineOutput
+
+## StableDiffusionPipelineSafe
+[[autodoc]] StableDiffusionPipelineSafe
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index e4722bec6897..1c5ecc5fe3cb 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -48,6 +48,7 @@ available a colab notebook to directly try them out.
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
 | [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
 | [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 
 
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 2ab62153634e..80936d1f6945 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -73,6 +73,7 @@
         StableDiffusionInpaintPipeline,
         StableDiffusionInpaintPipelineLegacy,
         StableDiffusionPipeline,
+        StableDiffusionPipelineSafe,
         VQDiffusionPipeline,
     )
 else:
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 7fc030dffb56..d61dc1316f32 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -24,6 +24,7 @@
         StableDiffusionInpaintPipelineLegacy,
         StableDiffusionPipeline,
     )
+    from .stable_diffusion_safe import StableDiffusionPipelineSafe
     from .vq_diffusion import VQDiffusionPipeline
 
 if is_transformers_available() and is_onnx_available():
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/__init__.py b/src/diffusers/pipelines/stable_diffusion_safe/__init__.py
new file mode 100644
index 000000000000..59ff61fa3b54
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion_safe/__init__.py
@@ -0,0 +1,72 @@
+from dataclasses import dataclass
+from enum import Enum
+from typing import List, Optional, Union
+
+import numpy as np
+
+import PIL
+from PIL import Image
+
+from ...utils import BaseOutput, is_torch_available, is_transformers_available
+
+
+@dataclass
+class SafetyConfig(object):
+    WEAK = {
+        "sld_warmup_steps": 15,
+        "sld_guidance_scale": 20,
+        "sld_threshold": 0.0,
+        "sld_momentum_scale": 0.0,
+        "sld_mom_beta": 0.0,
+    }
+    MEDIUM = {
+        "sld_warmup_steps": 10,
+        "sld_guidance_scale": 1000,
+        "sld_threshold": 0.01,
+        "sld_momentum_scale": 0.3,
+        "sld_mom_beta": 0.4,
+    }
+    STRONG = {
+        "sld_warmup_steps": 7,
+        "sld_guidance_scale": 2000,
+        "sld_threshold": 0.025,
+        "sld_momentum_scale": 0.5,
+        "sld_mom_beta": 0.7,
+    }
+    MAX = {
+        "sld_warmup_steps": 0,
+        "sld_guidance_scale": 5000,
+        "sld_threshold": 1.0,
+        "sld_momentum_scale": 0.5,
+        "sld_mom_beta": 0.7,
+    }
+
+
+@dataclass
+class StableDiffusionSafePipelineOutput(BaseOutput):
+    """
+    Output class for Safe Stable Diffusion pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+        nsfw_content_detected (`List[bool]`)
+            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, or `None` if safety checking could not be performed.
+        images (`List[PIL.Image.Image]` or `np.ndarray`)
+            List of denoised PIL images that were flagged by the safety checker any may contain "not-safe-for-work"
+            (nsfw) content, or `None` if no safety check was performed or no images were flagged.
+        applied_safety_concept (`str`)
+            The safety concept that was applied for safety guidance, or `None` if safety guidance was disabled
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]
+    unsafe_images: Optional[Union[List[PIL.Image.Image], np.ndarray]]
+    applied_safety_concept: Optional[str]
+
+
+if is_transformers_available() and is_torch_available():
+    from .pipeline_stable_diffusion_safe import StableDiffusionPipelineSafe
+    from .safety_checker import SafeStableDiffusionSafetyChecker
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
new file mode 100644
index 000000000000..421bfd1c9ff5
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -0,0 +1,721 @@
+import inspect
+import warnings
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+
+from ...configuration_utils import FrozenDict
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ...utils import deprecate, is_accelerate_available, logging
+from . import StableDiffusionSafePipelineOutput
+from .safety_checker import SafeStableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class StableDiffusionPipelineSafe(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Safe Latent Diffusion.
+
+    The implementation is based on the [`StableDiffusionPipeline`]
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[
+            DDIMScheduler,
+            DPMSolverMultistepScheduler,
+            EulerAncestralDiscreteScheduler,
+            EulerDiscreteScheduler,
+            LMSDiscreteScheduler,
+            PNDMScheduler,
+        ],
+        safety_checker: SafeStableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+        safety_concept: Optional[str] = (
+            "an image showing hate, harassment, violence, suffering, humiliation, harm, suicide, sexual, nudity,"
+            " bodily fluids, blood, obscene gestures, illegal activity, drug use, theft, vandalism, weapons, child"
+            " abuse, brutality, cruelty"
+        )
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self._safety_text_concept = safety_concept
+
+    @property
+    def safety_concept(self):
+        r"""
+        Getter method for the safety concept used with SLD
+
+        Returns:
+            `str`: The text describing the safety concept
+        """
+        return self._safety_text_concept
+
+    @safety_concept.setter
+    def safety_concept(self, concept):
+        r"""
+        Setter method for the safety concept used with SLD
+
+        Args:
+            concept (`str`):
+                The text of the new safety concept
+        """
+        self._safety_text_concept = concept
+
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def enable_sequential_cpu_offload(self):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device("cuda")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+        negative_prompt,
+        enable_safety_guidance,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # Encode the safety concept text
+            if enable_safety_guidance:
+                safety_concept_input = self.tokenizer(
+                    [self._safety_text_concept],
+                    padding="max_length",
+                    max_length=max_length,
+                    truncation=True,
+                    return_tensors="pt",
+                )
+                safety_embeddings = self.text_encoder(safety_concept_input.input_ids.to(self.device))[0]
+
+                # duplicate safety embeddings for each generation per prompt, using mps friendly method
+                seq_len = safety_embeddings.shape[1]
+                safety_embeddings = safety_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+                safety_embeddings = safety_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+                # For classifier free guidance + sld, we need to do three forward passes.
+                # Here we concatenate the unconditional and text embeddings into a single batch
+                # to avoid doing three forward passes
+                text_embeddings = torch.cat([uncond_embeddings, text_embeddings, safety_embeddings])
+
+            else:
+                # For classifier free guidance, we need to do two forward passes.
+                # Here we concatenate the unconditional and text embeddings into a single batch
+                # to avoid doing two forward passes
+                text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def run_safety_checker(self, image, device, dtype, enable_safety_guidance):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+            flagged_images = None
+            if any(has_nsfw_concept):
+                logger.warning(
+                    "Potential NSFW content was detected in one or more images. A black image will be returned"
+                    " instead."
+                    f" {'You may look at this images in the `unsafe_images` variable of the output at your own discretion.' if enable_safety_guidance else 'Try again with a different prompt and/or seed.'} "
+                )
+                flagged_images = np.zeros((2, *image.shape[1:]))
+                for idx, has_nsfw_concept in enumerate(has_nsfw_concept):
+                    if has_nsfw_concept:
+                        flagged_images[idx] = image[idx]
+                        image[idx] = np.zeros(image[idx].shape)  # black image
+        else:
+            has_nsfw_concept = None
+            flagged_images = None
+        return image, has_nsfw_concept, flagged_images
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def perform_safety_guidance(
+        self,
+        enable_safety_guidance,
+        safety_momentum,
+        noise_guidance,
+        noise_pred_out,
+        i,
+        sld_guidance_scale,
+        sld_warmup_steps,
+        sld_threshold,
+        sld_momentum_scale,
+        sld_mom_beta,
+    ):
+        # Perform SLD guidance
+        if enable_safety_guidance:
+            if safety_momentum is None:
+                safety_momentum = torch.zeros_like(noise_guidance)
+            noise_pred_text, noise_pred_uncond = noise_pred_out[0], noise_pred_out[1]
+            noise_pred_safety_concept = noise_pred_out[2]
+
+            # Equation 6
+            scale = torch.clamp(torch.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, max=1.0)
+
+            # Equation 6
+            safety_concept_scale = torch.where(
+                (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, torch.zeros_like(scale), scale
+            )
+
+            # Equation 4
+            noise_guidance_safety = torch.mul((noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale)
+
+            # Equation 7
+            noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum
+
+            # Equation 8
+            safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety
+
+            if i >= sld_warmup_steps:  # Warmup
+                # Equation 3
+                noise_guidance = noise_guidance - noise_guidance_safety
+        return noise_guidance, safety_momentum
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        sld_guidance_scale: Optional[float] = 1000,
+        sld_warmup_steps: Optional[int] = 10,
+        sld_threshold: Optional[float] = 0.01,
+        sld_momentum_scale: Optional[float] = 0.3,
+        sld_mom_beta: Optional[float] = 0.4,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            sld_guidance_scale (`float`, *optional*, defaults to 1000):
+                Safe latent guidance as defined in [Safe Latent Diffusion](https://arxiv.org/abs/2211.05105).
+                `sld_guidance_scale` is defined as sS of Eq. 6. If set to be less than 1, safety guidance will be
+                disabled.
+            sld_warmup_steps (`int`, *optional*, defaults to 10):
+                Number of warmup steps for safety guidance. SLD will only be applied for diffusion steps greater than
+                `sld_warmup_steps`. `sld_warmup_steps` is defined as `delta` of [Safe Latent
+                Diffusion](https://arxiv.org/abs/2211.05105).
+            sld_threshold (`float`, *optional*, defaults to 0.01):
+                Threshold that separates the hyperplane between appropriate and inappropriate images. `sld_threshold`
+                is defined as `lamda` of Eq. 5 in [Safe Latent Diffusion](https://arxiv.org/abs/2211.05105).
+            sld_momentum_scale (`float`, *optional*, defaults to 0.3):
+                Scale of the SLD momentum to be added to the safety guidance at each diffusion step. If set to 0.0
+                momentum will be disabled. Momentum is already built up during warmup, i.e. for diffusion steps smaller
+                than `sld_warmup_steps`. `sld_momentum_scale` is defined as `sm` of Eq. 7 in [Safe Latent
+                Diffusion](https://arxiv.org/abs/2211.05105).
+            sld_mom_beta (`float`, *optional*, defaults to 0.4):
+                Defines how safety guidance momentum builds up. `sld_mom_beta` indicates how much of the previous
+                momentum will be kept. Momentum is already built up during warmup, i.e. for diffusion steps smaller
+                than `sld_warmup_steps`. `sld_mom_beta` is defined as `beta m` of Eq. 8 in [Safe Latent
+                Diffusion](https://arxiv.org/abs/2211.05105).
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        enable_safety_guidance = sld_guidance_scale > 1.0 and do_classifier_free_guidance
+        if not enable_safety_guidance:
+            warnings.warn("Safety checker disabled!")
+
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, enable_safety_guidance
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        safety_momentum = None
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = (
+                torch.cat([latents] * (3 if enable_safety_guidance else 2)) if do_classifier_free_guidance else latents
+            )
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_out = noise_pred.chunk((3 if enable_safety_guidance else 2))
+                noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1]
+
+                # default classifier free guidance
+                noise_guidance = noise_pred_text - noise_pred_uncond
+
+                # Perform SLD guidance
+                if enable_safety_guidance:
+                    if safety_momentum is None:
+                        safety_momentum = torch.zeros_like(noise_guidance)
+                    noise_pred_safety_concept = noise_pred_out[2]
+
+                    # Equation 6
+                    scale = torch.clamp(
+                        torch.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, max=1.0
+                    )
+
+                    # Equation 6
+                    safety_concept_scale = torch.where(
+                        (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, torch.zeros_like(scale), scale
+                    )
+
+                    # Equation 4
+                    noise_guidance_safety = torch.mul(
+                        (noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale
+                    )
+
+                    # Equation 7
+                    noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum
+
+                    # Equation 8
+                    safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety
+
+                    if i >= sld_warmup_steps:  # Warmup
+                        # Equation 3
+                        noise_guidance = noise_guidance - noise_guidance_safety
+
+                noise_pred = noise_pred_uncond + guidance_scale * noise_guidance
+
+                # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        # 8. Post-processing
+        image = self.decode_latents(latents)
+
+        # 9. Run safety checker
+        image, has_nsfw_concept, flagged_images = self.run_safety_checker(
+            image, device, text_embeddings.dtype, enable_safety_guidance
+        )
+
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+            if flagged_images is not None:
+                flagged_images = self.numpy_to_pil(flagged_images)
+
+        if not return_dict:
+            return (
+                image,
+                has_nsfw_concept,
+                self._safety_text_concept if enable_safety_guidance else None,
+                flagged_images,
+            )
+
+        return StableDiffusionSafePipelineOutput(
+            images=image,
+            nsfw_content_detected=has_nsfw_concept,
+            applied_safety_concept=self._safety_text_concept if enable_safety_guidance else None,
+            unsafe_images=flagged_images,
+        )
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py b/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py
new file mode 100644
index 000000000000..f9dbf51e8644
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py
@@ -0,0 +1,110 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+
+from transformers import CLIPConfig, CLIPVisionModel, PreTrainedModel
+
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def cosine_distance(image_embeds, text_embeds):
+    normalized_image_embeds = nn.functional.normalize(image_embeds)
+    normalized_text_embeds = nn.functional.normalize(text_embeds)
+    return torch.mm(normalized_image_embeds, normalized_text_embeds.t())
+
+
+class SafeStableDiffusionSafetyChecker(PreTrainedModel):
+    config_class = CLIPConfig
+
+    _no_split_modules = ["CLIPEncoderLayer"]
+
+    def __init__(self, config: CLIPConfig):
+        super().__init__(config)
+
+        self.vision_model = CLIPVisionModel(config.vision_config)
+        self.visual_projection = nn.Linear(config.vision_config.hidden_size, config.projection_dim, bias=False)
+
+        self.concept_embeds = nn.Parameter(torch.ones(17, config.projection_dim), requires_grad=False)
+        self.special_care_embeds = nn.Parameter(torch.ones(3, config.projection_dim), requires_grad=False)
+
+        self.concept_embeds_weights = nn.Parameter(torch.ones(17), requires_grad=False)
+        self.special_care_embeds_weights = nn.Parameter(torch.ones(3), requires_grad=False)
+
+    @torch.no_grad()
+    def forward(self, clip_input, images):
+        pooled_output = self.vision_model(clip_input)[1]  # pooled_output
+        image_embeds = self.visual_projection(pooled_output)
+
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).cpu().float().numpy()
+        cos_dist = cosine_distance(image_embeds, self.concept_embeds).cpu().float().numpy()
+
+        result = []
+        batch_size = image_embeds.shape[0]
+        for i in range(batch_size):
+            result_img = {"special_scores": {}, "special_care": [], "concept_scores": {}, "bad_concepts": []}
+
+            # increase this value to create a stronger `nfsw` filter
+            # at the cost of increasing the possibility of filtering benign images
+            adjustment = 0.0
+
+            for concept_idx in range(len(special_cos_dist[0])):
+                concept_cos = special_cos_dist[i][concept_idx]
+                concept_threshold = self.special_care_embeds_weights[concept_idx].item()
+                result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
+                if result_img["special_scores"][concept_idx] > 0:
+                    result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]})
+                    adjustment = 0.01
+
+            for concept_idx in range(len(cos_dist[0])):
+                concept_cos = cos_dist[i][concept_idx]
+                concept_threshold = self.concept_embeds_weights[concept_idx].item()
+                result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3)
+                if result_img["concept_scores"][concept_idx] > 0:
+                    result_img["bad_concepts"].append(concept_idx)
+
+            result.append(result_img)
+
+        has_nsfw_concepts = [len(res["bad_concepts"]) > 0 for res in result]
+
+        return images, has_nsfw_concepts
+
+    @torch.no_grad()
+    def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor):
+        pooled_output = self.vision_model(clip_input)[1]  # pooled_output
+        image_embeds = self.visual_projection(pooled_output)
+
+        special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds)
+        cos_dist = cosine_distance(image_embeds, self.concept_embeds)
+
+        # increase this value to create a stronger `nsfw` filter
+        # at the cost of increasing the possibility of filtering benign images
+        adjustment = 0.0
+
+        special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment
+        # special_scores = special_scores.round(decimals=3)
+        special_care = torch.any(special_scores > 0, dim=1)
+        special_adjustment = special_care * 0.01
+        special_adjustment = special_adjustment.unsqueeze(1).expand(-1, cos_dist.shape[1])
+
+        concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment
+        # concept_scores = concept_scores.round(decimals=3)
+        has_nsfw_concepts = torch.any(concept_scores > 0, dim=1)
+
+        return images, has_nsfw_concepts
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 92c163ba74ea..ddb93f5d5fa5 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -124,6 +124,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class StableDiffusionPipelineSafe(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class VQDiffusionPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
diff --git a/tests/pipelines/stable_diffusion_safe/__init__.py b/tests/pipelines/stable_diffusion_safe/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
new file mode 100644
index 000000000000..dcb3f2730360
--- /dev/null
+++ b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
@@ -0,0 +1,435 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
+from diffusers.pipelines.stable_diffusion_safe import StableDiffusionPipelineSafe as StableDiffusionPipeline
+from diffusers.utils import floats_tensor, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class SafeDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_safe_diffusion_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_pndm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_no_safety_checker(self):
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
+        )
+        assert isinstance(pipe, StableDiffusionPipeline)
+        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
+        assert pipe.safety_checker is None
+
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+        # check that there's no error when saving a pipeline with one of the models being None
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+        # sanity check that the pipeline still works
+        assert pipe.safety_checker is None
+        image = pipe("example prompt", num_inference_steps=2).images[0]
+        assert image is not None
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_fp16(self):
+        """Test that stable diffusion works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
+
+        assert image.shape == (1, 128, 128, 3)
+
+
+@slow
+@require_torch_gpu
+class SafeDiffusionPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_harm_safe_stable_diffusion(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
+        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = (
+            "portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle"
+            " coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with"
+            " anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and"
+            " children from bahnhof zoo, detailed "
+        )
+        seed = 4003660346
+        guidance_scale = 7
+
+        # without safety guidance (sld_guidance_scale = 0)
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=0,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [0.2278, 0.2231, 0.2249, 0.2333, 0.2303, 0.1885, 0.2273, 0.2144, 0.2176]
+
+        assert image.shape == (1, 512, 512, 3)
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        # without safety guidance (strong configuration)
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=2000,
+            sld_warmup_steps=7,
+            sld_threshold=0.025,
+            sld_momentum_scale=0.5,
+            sld_mom_beta=0.7,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [0.2383, 0.2276, 0.236, 0.2192, 0.2186, 0.2053, 0.1971, 0.1901, 0.1719]
+
+        assert image.shape == (1, 512, 512, 3)
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_nudity_safe_stable_diffusion(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
+        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "padme amidala taking a bath artwork, safe for work, no nudity"
+        seed = 2734971755
+        guidance_scale = 7
+
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=0,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [0.3502, 0.3622, 0.3396, 0.3642, 0.3478, 0.3318, 0.35, 0.3348, 0.3297]
+
+        assert image.shape == (1, 512, 512, 3)
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=2000,
+            sld_warmup_steps=7,
+            sld_threshold=0.025,
+            sld_momentum_scale=0.5,
+            sld_mom_beta=0.7,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = [0.5531, 0.5206, 0.4895, 0.5156, 0.5182, 0.4751, 0.4802, 0.4803, 0.4443]
+
+        assert image.shape == (1, 512, 512, 3)
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_nudity_safetychecker_safe_stable_diffusion(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = (
+            "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c."
+            " leyendecker"
+        )
+        seed = 1044355234
+        guidance_scale = 12
+
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=0,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
+
+        assert image.shape == (1, 512, 512, 3)
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-7
+
+        generator = torch.Generator(device=torch_device).manual_seed(seed)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=50,
+            output_type="np",
+            width=512,
+            height=512,
+            sld_guidance_scale=2000,
+            sld_warmup_steps=7,
+            sld_threshold=0.025,
+            sld_momentum_scale=0.5,
+            sld_mom_beta=0.7,
+        )
+
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+        expected_slice = np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561])
+        assert image.shape == (1, 512, 512, 3)
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

From 8b84f8519264942fa0e52444881390767cb766c5 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Tue, 22 Nov 2022 13:35:23 +0100
Subject: [PATCH 229/269] [examples] fix mixed_precision arg (#1359)

* use accelerator to check mixed_precision

* default `mixed_precision` to `None`

* pass mixed_precision to accelerate launch
---
 examples/dreambooth/README.md                 |  5 ++---
 examples/dreambooth/train_dreambooth.py       | 12 ++++++------
 examples/text_to_image/README.md              |  6 ++----
 examples/text_to_image/train_text_to_image.py | 12 ++++++------
 4 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md
index 2339e2979dbd..7aaf1bc46c22 100644
--- a/examples/dreambooth/README.md
+++ b/examples/dreambooth/README.md
@@ -141,7 +141,7 @@ export INSTANCE_DIR="path-to-instance-images"
 export CLASS_DIR="path-to-class-images"
 export OUTPUT_DIR="path-to-save-model"
 
-accelerate launch train_dreambooth.py \
+accelerate launch --mixed_precision="fp16" train_dreambooth.py \
   --pretrained_model_name_or_path=$MODEL_NAME \
   --instance_data_dir=$INSTANCE_DIR \
   --class_data_dir=$CLASS_DIR \
@@ -157,8 +157,7 @@ accelerate launch train_dreambooth.py \
   --lr_scheduler="constant" \
   --lr_warmup_steps=0 \
   --num_class_images=200 \
-  --max_train_steps=800 \
-  --mixed_precision=fp16
+  --max_train_steps=800
 ```
 
 ### Fine-tune text encoder with the UNet.
diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index 13d0eb6ce0ac..1f6c730f2baf 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -187,12 +187,12 @@ def parse_args(input_args=None):
     parser.add_argument(
         "--mixed_precision",
         type=str,
-        default="no",
+        default=None,
         choices=["no", "fp16", "bf16"],
         help=(
-            "Whether to use mixed precision. Choose"
-            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-            "and an Nvidia Ampere GPU."
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
         ),
     )
     parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
@@ -538,9 +538,9 @@ def collate_fn(examples):
         )
 
     weight_dtype = torch.float32
-    if args.mixed_precision == "fp16":
+    if accelerator.mixed_precision == "fp16":
         weight_dtype = torch.float16
-    elif args.mixed_precision == "bf16":
+    elif accelerator.mixed_precision == "bf16":
         weight_dtype = torch.bfloat16
 
     # Move text_encode and vae to gpu.
diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md
index 170ed384f13c..abe21875848a 100644
--- a/examples/text_to_image/README.md
+++ b/examples/text_to_image/README.md
@@ -46,7 +46,7 @@ With `gradient_checkpointing` and `mixed_precision` it should be possible to fin
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export dataset_name="lambdalabs/pokemon-blip-captions"
 
-accelerate launch train_text_to_image.py \
+accelerate launch --mixed_precision="fp16"  train_text_to_image.py \
   --pretrained_model_name_or_path=$MODEL_NAME \
   --dataset_name=$dataset_name \
   --use_ema \
@@ -54,7 +54,6 @@ accelerate launch train_text_to_image.py \
   --train_batch_size=1 \
   --gradient_accumulation_steps=4 \
   --gradient_checkpointing \
-  --mixed_precision="fp16" \
   --max_train_steps=15000 \
   --learning_rate=1e-05 \
   --max_grad_norm=1 \
@@ -70,7 +69,7 @@ If you wish to use custom loading logic, you should modify the script, we have l
 export MODEL_NAME="CompVis/stable-diffusion-v1-4"
 export TRAIN_DIR="path_to_your_dataset"
 
-accelerate launch train_text_to_image.py \
+accelerate launch --mixed_precision="fp16" train_text_to_image.py \
   --pretrained_model_name_or_path=$MODEL_NAME \
   --train_data_dir=$TRAIN_DIR \
   --use_ema \
@@ -78,7 +77,6 @@ accelerate launch train_text_to_image.py \
   --train_batch_size=1 \
   --gradient_accumulation_steps=4 \
   --gradient_checkpointing \
-  --mixed_precision="fp16" \
   --max_train_steps=15000 \
   --learning_rate=1e-05 \
   --max_grad_norm=1 \
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index d615abb4643c..88da2a55096a 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -186,12 +186,12 @@ def parse_args():
     parser.add_argument(
         "--mixed_precision",
         type=str,
-        default="no",
+        default=None,
         choices=["no", "fp16", "bf16"],
         help=(
-            "Whether to use mixed precision. Choose"
-            "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10."
-            "and an Nvidia Ampere GPU."
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
         ),
     )
     parser.add_argument(
@@ -496,9 +496,9 @@ def collate_fn(examples):
     )
 
     weight_dtype = torch.float32
-    if args.mixed_precision == "fp16":
+    if accelerator.mixed_precision == "fp16":
         weight_dtype = torch.float16
-    elif args.mixed_precision == "bf16":
+    elif accelerator.mixed_precision == "bf16":
         weight_dtype = torch.bfloat16
 
     # Move text_encode and vae to gpu.

From 2d6d4edbbdb3c6d7013df1db9369634355a75846 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Tue, 22 Nov 2022 13:37:17 +0100
Subject: [PATCH 230/269] use memory_efficient_attention by default (#1354)

* use memory_efficient_attention by default

* Update src/diffusers/models/attention.py

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 src/diffusers/models/attention.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 69522f76b09c..7e11bde27347 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
+import warnings
 from dataclasses import dataclass
 from typing import Optional
 
@@ -396,6 +397,16 @@ def __init__(
             self.norm2 = nn.LayerNorm(dim)
         self.norm3 = nn.LayerNorm(dim)
 
+        # if xformers is installed try to use memory_efficient_attention by default
+        if is_xformers_available():
+            try:
+                self._set_use_memory_efficient_attention_xformers(True)
+            except Exception as e:
+                warnings.warn(
+                    "Could not enable memory efficient attention. Make sure xformers is installed"
+                    f" correctly and a GPU is available: {e}"
+                )
+
     def _set_attention_slice(self, slice_size):
         self.attn1._slice_size = slice_size
         self.attn2._slice_size = slice_size

From 44e56de9aaaa103ad11ca2953dc86ba6f64ba5d4 Mon Sep 17 00:00:00 2001
From: regisss <15324346+regisss@users.noreply.github.com>
Date: Tue, 22 Nov 2022 20:44:34 +0100
Subject: [PATCH 231/269] Replace logger.warn by logger.warning (#1366)

---
 examples/community/img2img_inpainting.py                    | 2 +-
 examples/community/interpolate_stable_diffusion.py          | 2 +-
 examples/community/lpw_stable_diffusion.py                  | 2 +-
 examples/community/multilingual_stable_diffusion.py         | 2 +-
 examples/community/sd_text2img_k_diffusion.py               | 2 +-
 examples/community/speech_to_image_diffusion.py             | 2 +-
 examples/community/text_inpainting.py                       | 2 +-
 examples/community/wildcard_stable_diffusion.py             | 2 +-
 src/diffusers/modeling_utils.py                             | 2 +-
 src/diffusers/pipeline_flax_utils.py                        | 4 ++--
 src/diffusers/pipeline_utils.py                             | 6 +++---
 .../pipelines/alt_diffusion/pipeline_alt_diffusion.py       | 2 +-
 .../alt_diffusion/pipeline_alt_diffusion_img2img.py         | 2 +-
 .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py  | 2 +-
 .../stable_diffusion/pipeline_flax_stable_diffusion.py      | 2 +-
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py | 2 +-
 .../stable_diffusion/pipeline_stable_diffusion_img2img.py   | 2 +-
 .../stable_diffusion/pipeline_stable_diffusion_inpaint.py   | 2 +-
 .../pipeline_stable_diffusion_inpaint_legacy.py             | 2 +-
 .../stable_diffusion_safe/pipeline_stable_diffusion_safe.py | 2 +-
 .../schedulers/scheduling_euler_ancestral_discrete.py       | 2 +-
 src/diffusers/schedulers/scheduling_euler_discrete.py       | 2 +-
 22 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py
index f7a107136d89..3fa7db13a482 100644
--- a/examples/community/img2img_inpainting.py
+++ b/examples/community/img2img_inpainting.py
@@ -110,7 +110,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py
index 761aaeca69c1..4d7a73f5ba69 100644
--- a/examples/community/interpolate_stable_diffusion.py
+++ b/examples/community/interpolate_stable_diffusion.py
@@ -101,7 +101,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index b952ffe76df1..0e7dc9e1ed11 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -469,7 +469,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/examples/community/multilingual_stable_diffusion.py b/examples/community/multilingual_stable_diffusion.py
index c71c1f10c588..19974d6df08b 100644
--- a/examples/community/multilingual_stable_diffusion.py
+++ b/examples/community/multilingual_stable_diffusion.py
@@ -113,7 +113,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/examples/community/sd_text2img_k_diffusion.py b/examples/community/sd_text2img_k_diffusion.py
index e993c70de987..9592f7879ff0 100755
--- a/examples/community/sd_text2img_k_diffusion.py
+++ b/examples/community/sd_text2img_k_diffusion.py
@@ -77,7 +77,7 @@ def __init__(
         super().__init__()
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/examples/community/speech_to_image_diffusion.py b/examples/community/speech_to_image_diffusion.py
index 1a9d296e81f3..17bc08e3c291 100644
--- a/examples/community/speech_to_image_diffusion.py
+++ b/examples/community/speech_to_image_diffusion.py
@@ -42,7 +42,7 @@ def __init__(
         super().__init__()
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/examples/community/text_inpainting.py b/examples/community/text_inpainting.py
index 38d5e963372d..a4368f8b4365 100644
--- a/examples/community/text_inpainting.py
+++ b/examples/community/text_inpainting.py
@@ -99,7 +99,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py
index 9ad0d8e9fa04..282be8e48b7d 100644
--- a/examples/community/wildcard_stable_diffusion.py
+++ b/examples/community/wildcard_stable_diffusion.py
@@ -135,7 +135,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py
index 1e91ccd56a15..704ba00cad29 100644
--- a/src/diffusers/modeling_utils.py
+++ b/src/diffusers/modeling_utils.py
@@ -332,7 +332,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
         if low_cpu_mem_usage and not is_accelerate_available():
             low_cpu_mem_usage = False
-            logger.warn(
+            logger.warning(
                 "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
                 " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
                 " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipeline_flax_utils.py
index 54bb028139fb..bf2e259ea1a7 100644
--- a/src/diffusers/pipeline_flax_utils.py
+++ b/src/diffusers/pipeline_flax_utils.py
@@ -411,13 +411,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                             f" {expected_class_obj}"
                         )
                 elif passed_class_obj[name] is None:
-                    logger.warn(
+                    logger.warning(
                         f"You have passed `None` for {name} to disable its functionality in {pipeline_class}. Note"
                         f" that this might lead to problems when using {pipeline_class} and is not recommended."
                     )
                     sub_model_should_be_defined = False
                 else:
-                    logger.warn(
+                    logger.warning(
                         f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
                         " has the correct type"
                     )
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index cf2bbb980ed0..3f2857fa4f56 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -405,7 +405,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
         if low_cpu_mem_usage and not is_accelerate_available():
             low_cpu_mem_usage = False
-            logger.warn(
+            logger.warning(
                 "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
                 " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
                 " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
@@ -571,13 +571,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                             f" {expected_class_obj}"
                         )
                 elif passed_class_obj[name] is None:
-                    logger.warn(
+                    logger.warning(
                         f"You have passed `None` for {name} to disable its functionality in {pipeline_class}. Note"
                         f" that this might lead to problems when using {pipeline_class} and is not recommended."
                     )
                     sub_model_should_be_defined = False
                 else:
-                    logger.warn(
+                    logger.warning(
                         f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
                         " has the correct type"
                     )
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index afb2c5288640..246f2b8720b6 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -115,7 +115,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 9a50ef4a4e9c..7fc1658ea078 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -128,7 +128,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 2b3cf8fa95d5..8d702b1b0279 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -160,7 +160,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index a2f0f73dbf1f..9c668d5e51d5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -88,7 +88,7 @@ def __init__(
         self.dtype = dtype
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 963d75c58bdd..fbfac6b5a0f4 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -114,7 +114,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index f543d564fe84..7efd39e72656 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -127,7 +127,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 2058d972fb87..9eb8de248293 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -192,7 +192,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 77e903ff6807..003b2668e75a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -140,7 +140,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
index 421bfd1c9ff5..cfa71b9242b0 100644
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -108,7 +108,7 @@ def __init__(
             scheduler._internal_dict = FrozenDict(new_config)
 
         if safety_checker is None:
-            logger.warn(
+            logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
                 " results in services or applications open to the public. Both the diffusers team and Hugging Face"
diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
index f3abf017d9e5..8117f305606a 100644
--- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py
@@ -189,7 +189,7 @@ def step(
             )
 
         if not self.is_scale_input_called:
-            logger.warn(
+            logger.warning(
                 "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
                 "See `StableDiffusionPipeline` for a usage example."
             )
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index d9991bc3a097..3b2262fcc6d5 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -198,7 +198,7 @@ def step(
             )
 
         if not self.is_scale_input_called:
-            logger.warn(
+            logger.warning(
                 "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
                 "See `StableDiffusionPipeline` for a usage example."
             )

From 8fd3a74322befbd13bb461e4cb9e1a57f6e9ed96 Mon Sep 17 00:00:00 2001
From: Penn <jenkspt@gmail.com>
Date: Wed, 23 Nov 2022 05:11:39 -0500
Subject: [PATCH 232/269] Fix using non-square images with UNet2DModel and
 DDIM/DDPM pipelines (#1289)

* fix non square images with UNet2DModel and DDIM/DDPM pipelines

* fix unet_2d `sample_size` docstring

* update pipeline tests for unet uncond

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 src/diffusers/models/unet_2d.py               |  6 +-
 src/diffusers/models/unet_2d_condition.py     |  3 +-
 src/diffusers/pipelines/ddim/pipeline_ddim.py |  6 +-
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py |  6 +-
 tests/test_pipelines.py                       | 62 +++++++++++++------
 5 files changed, 57 insertions(+), 26 deletions(-)

diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py
index 04324057609e..3279830d7d86 100644
--- a/src/diffusers/models/unet_2d.py
+++ b/src/diffusers/models/unet_2d.py
@@ -43,8 +43,8 @@ class UNet2DModel(ModelMixin, ConfigMixin):
     implements for all the model (such as downloading or saving, etc.)
 
     Parameters:
-        sample_size (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
-            Input sample size.
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
         in_channels (`int`, *optional*, defaults to 3): Number of channels in the input image.
         out_channels (`int`, *optional*, defaults to 3): Number of channels in the output.
         center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
@@ -71,7 +71,7 @@ class UNet2DModel(ModelMixin, ConfigMixin):
     @register_to_config
     def __init__(
         self,
-        sample_size: Optional[int] = None,
+        sample_size: Optional[Union[int, Tuple[int, int]]] = None,
         in_channels: int = 3,
         out_channels: int = 3,
         center_input_sample: bool = False,
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index c3f2fb87b60d..b09044b57b7b 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -56,7 +56,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin):
     implements for all the models (such as downloading or saving, etc.)
 
     Parameters:
-        sample_size (`int`, *optional*): The size of the input sample.
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
         in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
         out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
         center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
diff --git a/src/diffusers/pipelines/ddim/pipeline_ddim.py b/src/diffusers/pipelines/ddim/pipeline_ddim.py
index 6db6298329a7..b9e590dea646 100644
--- a/src/diffusers/pipelines/ddim/pipeline_ddim.py
+++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py
@@ -89,7 +89,11 @@ def __call__(
             generator = None
 
         # Sample gaussian noise to begin loop
-        image_shape = (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size)
+        if isinstance(self.unet.sample_size, int):
+            image_shape = (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size)
+        else:
+            image_shape = (batch_size, self.unet.in_channels, *self.unet.sample_size)
+
         if self.device.type == "mps":
             # randn does not work reproducibly on mps
             image = torch.randn(image_shape, generator=generator)
diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index c937a230031e..634e1c0f99f6 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -94,7 +94,11 @@ def __call__(
             generator = None
 
         # Sample gaussian noise to begin loop
-        image_shape = (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size)
+        if isinstance(self.unet.sample_size, int):
+            image_shape = (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size)
+        else:
+            image_shape = (batch_size, self.unet.in_channels, *self.unet.sample_size)
+
         if self.device.type == "mps":
             # randn does not work reproducibly on mps
             image = torch.randn(image_shape, generator=generator)
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index b593791c9455..19493e323149 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -18,6 +18,7 @@
 import random
 import tempfile
 import unittest
+from functools import partial
 
 import numpy as np
 import torch
@@ -46,6 +47,7 @@
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, slow, torch_device
 from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir, require_torch_gpu
+from parameterized import parameterized
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
@@ -247,7 +249,6 @@ def test_load_pipeline_from_git(self):
 
 
 class PipelineFastTests(unittest.TestCase):
-    @property
     def dummy_image(self):
         batch_size = 1
         num_channels = 3
@@ -256,13 +257,12 @@ def dummy_image(self):
         image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
         return image
 
-    @property
-    def dummy_uncond_unet(self):
+    def dummy_uncond_unet(self, sample_size=32):
         torch.manual_seed(0)
         model = UNet2DModel(
             block_out_channels=(32, 64),
             layers_per_block=2,
-            sample_size=32,
+            sample_size=sample_size,
             in_channels=3,
             out_channels=3,
             down_block_types=("DownBlock2D", "AttnDownBlock2D"),
@@ -270,13 +270,12 @@ def dummy_uncond_unet(self):
         )
         return model
 
-    @property
-    def dummy_cond_unet(self):
+    def dummy_cond_unet(self, sample_size=32):
         torch.manual_seed(0)
         model = UNet2DConditionModel(
             block_out_channels=(32, 64),
             layers_per_block=2,
-            sample_size=32,
+            sample_size=sample_size,
             in_channels=4,
             out_channels=4,
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
@@ -285,13 +284,12 @@ def dummy_cond_unet(self):
         )
         return model
 
-    @property
-    def dummy_cond_unet_inpaint(self):
+    def dummy_cond_unet_inpaint(self, sample_size=32):
         torch.manual_seed(0)
         model = UNet2DConditionModel(
             block_out_channels=(32, 64),
             layers_per_block=2,
-            sample_size=32,
+            sample_size=sample_size,
             in_channels=9,
             out_channels=4,
             down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
@@ -300,7 +298,6 @@ def dummy_cond_unet_inpaint(self):
         )
         return model
 
-    @property
     def dummy_vq_model(self):
         torch.manual_seed(0)
         model = VQModel(
@@ -313,7 +310,6 @@ def dummy_vq_model(self):
         )
         return model
 
-    @property
     def dummy_vae(self):
         torch.manual_seed(0)
         model = AutoencoderKL(
@@ -326,7 +322,6 @@ def dummy_vae(self):
         )
         return model
 
-    @property
     def dummy_text_encoder(self):
         torch.manual_seed(0)
         config = CLIPTextConfig(
@@ -342,7 +337,6 @@ def dummy_text_encoder(self):
         )
         return CLIPTextModel(config)
 
-    @property
     def dummy_extractor(self):
         def extract(*args, **kwargs):
             class Out:
@@ -357,15 +351,43 @@ def to(self, device):
 
         return extract
 
-    def test_components(self):
+    @parameterized.expand(
+        [
+            [DDIMScheduler, DDIMPipeline, 32],
+            [partial(DDPMScheduler, predict_epsilon=True), DDPMPipeline, 32],
+            [DDIMScheduler, DDIMPipeline, (32, 64)],
+            [partial(DDPMScheduler, predict_epsilon=True), DDPMPipeline, (64, 32)],
+        ]
+    )
+    def test_uncond_unet_components(self, scheduler_fn=DDPMScheduler, pipeline_fn=DDPMPipeline, sample_size=32):
+        unet = self.dummy_uncond_unet(sample_size)
+        # DDIM doesn't take `predict_epsilon`, and DDPM requires it -- so using partial in parameterized decorator
+        scheduler = scheduler_fn()
+        pipeline = pipeline_fn(unet, scheduler).to(torch_device)
+
+        # Device type MPS is not supported for torch.Generator() api.
+        if torch_device == "mps":
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
+
+        out_image = pipeline(
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+        sample_size = (sample_size, sample_size) if isinstance(sample_size, int) else sample_size
+        assert out_image.shape == (1, *sample_size, 3)
+
+    def test_stable_diffusion_components(self):
         """Test that components property works correctly"""
-        unet = self.dummy_cond_unet
+        unet = self.dummy_cond_unet()
         scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
+        vae = self.dummy_vae()
+        bert = self.dummy_text_encoder()
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
-        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        image = self.dummy_image().cpu().permute(0, 2, 3, 1)[0]
         init_image = Image.fromarray(np.uint8(image)).convert("RGB")
         mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
 
@@ -377,7 +399,7 @@ def test_components(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+            feature_extractor=self.dummy_extractor(),
         ).to(torch_device)
         img2img = StableDiffusionImg2ImgPipeline(**inpaint.components).to(torch_device)
         text2img = StableDiffusionPipeline(**inpaint.components).to(torch_device)

From 9e234d8048e5b9f631937f6dcdec364952e4f90e Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Wed, 23 Nov 2022 11:13:34 +0100
Subject: [PATCH 233/269] handle fp16 in `UNet2DModel` (#1216)

* make sure fp16 runs well

* add fp16 test for superes

* Update src/diffusers/models/unet_2d.py

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* gen on cuda

* always run fast inferecne test on cpu

* run on cpu

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 src/diffusers/models/unet_2d.py               |  9 +++++---
 .../test_latent_diffusion_superresolution.py  | 21 +++++++++++++++++++
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py
index 3279830d7d86..6d28f07b18a6 100644
--- a/src/diffusers/models/unet_2d.py
+++ b/src/diffusers/models/unet_2d.py
@@ -209,6 +209,11 @@ def forward(
         timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device)
 
         t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
         emb = self.time_embedding(t_emb)
 
         # 2. pre-process
@@ -242,9 +247,7 @@ def forward(
                 sample = upsample_block(sample, res_samples, emb)
 
         # 6. post-process
-        # make sure hidden states is in float32
-        # when running in half-precision
-        sample = self.conv_norm_out(sample.float()).type(sample.dtype)
+        sample = self.conv_norm_out(sample)
         sample = self.conv_act(sample)
         sample = self.conv_out(sample)
 
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
index c04210dede31..6f1f51c7ba75 100644
--- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py
@@ -87,6 +87,27 @@ def test_inference_superresolution(self):
         expected_slice = np.array([0.8678, 0.8245, 0.6381, 0.6830, 0.4385, 0.5599, 0.4641, 0.6201, 0.5150])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_inference_superresolution_fp16(self):
+        unet = self.dummy_uncond_unet
+        scheduler = DDIMScheduler()
+        vqvae = self.dummy_vq_model
+
+        # put models in fp16
+        unet = unet.half()
+        vqvae = vqvae.half()
+
+        ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler)
+        ldm.to(torch_device)
+        ldm.set_progress_bar_config(disable=None)
+
+        init_image = self.dummy_image.to(torch_device)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images
+
+        assert image.shape == (1, 64, 64, 3)
+
 
 @slow
 @require_torch

From 0eb507f2af991b1f0b6c2ede5b20a994999e85d3 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Wed, 23 Nov 2022 14:36:39 +0100
Subject: [PATCH 234/269] StableDiffusionImageVariationPipeline (#1365)

* add StableDiffusionImageVariationPipeline

* add ini init

* use CLIPVisionModelWithProjection

* fix _encode_image

* add copied from

* fix copies

* add doc

* handle tensor in _encode_image

* add tests

* correct model_id

* remove copied from in enable_sequential_cpu_offload

* fix tests

* make slow tests pass

* update slow tests

* use temp model for now

* fix test_stable_diffusion_img_variation_intermediate_state

* fix test_stable_diffusion_img_variation_intermediate_state

* check for torch.Tensor

* quality

* fix name

* fix slow tests

* install transformers from source

* fix install

* fix install

* Apply suggestions from code review

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* input_image -> image

* remove deprication warnings

* fix test_stable_diffusion_img_variation_multiple_images

* make flake happy

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 .github/workflows/pr_tests.yml                |   2 +
 .github/workflows/push_tests.yml              |   2 +
 .../source/api/pipelines/stable_diffusion.mdx |   7 +
 src/diffusers/__init__.py                     |   1 +
 src/diffusers/pipelines/__init__.py           |   1 +
 .../pipelines/stable_diffusion/__init__.py    |   1 +
 ...peline_stable_diffusion_image_variation.py | 437 ++++++++++++++++++
 .../dummy_torch_and_transformers_objects.py   |  15 +
 .../test_stable_diffusion_image_variation.py  | 424 +++++++++++++++++
 9 files changed, 890 insertions(+)
 create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
 create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py

diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml
index dc1c482aa098..55a9bd68deb3 100644
--- a/.github/workflows/pr_tests.yml
+++ b/.github/workflows/pr_tests.yml
@@ -60,6 +60,7 @@ jobs:
       run: |
         python -m pip install -e .[quality,test]
         python -m pip install git+https://github.com/huggingface/accelerate
+        python -m pip install -U git+https://github.com/huggingface/transformers
 
     - name: Environment
       run: |
@@ -127,6 +128,7 @@ jobs:
         ${CONDA_RUN} python -m pip install -e .[quality,test]
         ${CONDA_RUN} python -m pip install --pre torch==${MPS_TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/test/cpu
         ${CONDA_RUN} python -m pip install git+https://github.com/huggingface/accelerate
+        ${CONDA_RUN} python -m pip install -U git+https://github.com/huggingface/transformers
 
     - name: Environment
       shell: arch -arch arm64 bash {0}
diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml
index 2beb05e8eaca..4bab00b7eeb0 100644
--- a/.github/workflows/push_tests.yml
+++ b/.github/workflows/push_tests.yml
@@ -62,6 +62,7 @@ jobs:
       run: |
         python -m pip install -e .[quality,test]
         python -m pip install git+https://github.com/huggingface/accelerate
+        python -m pip install -U git+https://github.com/huggingface/transformers
 
     - name: Environment
       run: |
@@ -131,6 +132,7 @@ jobs:
       run: |
         python -m pip install -e .[quality,test,training]
         python -m pip install git+https://github.com/huggingface/accelerate
+        python -m pip install -U git+https://github.com/huggingface/transformers
 
     - name: Environment
       run: |
diff --git a/docs/source/api/pipelines/stable_diffusion.mdx b/docs/source/api/pipelines/stable_diffusion.mdx
index 8b551f7a3b17..9884cbb20772 100644
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -88,3 +88,10 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
+
+
+## StableDiffusionImageVariationPipeline
+[[autodoc]] StableDiffusionImageVariationPipeline
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 80936d1f6945..9669382bf69f 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -69,6 +69,7 @@
         AltDiffusionPipeline,
         CycleDiffusionPipeline,
         LDMTextToImagePipeline,
+        StableDiffusionImageVariationPipeline,
         StableDiffusionImg2ImgPipeline,
         StableDiffusionInpaintPipeline,
         StableDiffusionInpaintPipelineLegacy,
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index d61dc1316f32..a27cb5d2076a 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -19,6 +19,7 @@
     from .latent_diffusion import LDMTextToImagePipeline
     from .stable_diffusion import (
         CycleDiffusionPipeline,
+        StableDiffusionImageVariationPipeline,
         StableDiffusionImg2ImgPipeline,
         StableDiffusionInpaintPipeline,
         StableDiffusionInpaintPipelineLegacy,
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index fe813b07cc5b..0b2fa15d7686 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -30,6 +30,7 @@ class StableDiffusionPipelineOutput(BaseOutput):
 if is_transformers_available() and is_torch_available():
     from .pipeline_cycle_diffusion import CycleDiffusionPipeline
     from .pipeline_stable_diffusion import StableDiffusionPipeline
+    from .pipeline_stable_diffusion_image_variation import StableDiffusionImageVariationPipeline
     from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
     from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
     from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
new file mode 100644
index 000000000000..4cfa5817af03
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -0,0 +1,437 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+
+import PIL
+from diffusers.utils import is_accelerate_available
+from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection
+
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from ...utils import logging
+from . import StableDiffusionPipelineOutput
+from .safety_checker import StableDiffusionSafetyChecker
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class StableDiffusionImageVariationPipeline(DiffusionPipeline):
+    r"""
+    Pipeline to generate variations from an input image using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            Frozen CLIP image-encoder. Stable Diffusion Image Variation uses the vision portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection),
+            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        image_encoder: CLIPVisionModelWithProjection,
+        unet: UNet2DConditionModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+    ):
+        super().__init__()
+
+        if safety_checker is None:
+            logger.warn(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        self.register_modules(
+            vae=vae,
+            image_encoder=image_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.unet.config.attention_head_dim // 2
+        self.unet.set_attention_slice(slice_size)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        for cpu_offloaded_model in [self.unet, self.image_encoder, self.vae, self.safety_checker]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance):
+        dtype = next(self.image_encoder.parameters()).dtype
+
+        if not isinstance(image, torch.Tensor):
+            image = self.feature_extractor(images=image, return_tensors="pt").pixel_values
+
+        image = image.to(device=device, dtype=dtype)
+        image_embeddings = self.image_encoder(image).image_embeds
+        image_embeddings = image_embeddings.unsqueeze(1)
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = image_embeddings.shape
+        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
+        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        if do_classifier_free_guidance:
+            uncond_embeddings = torch.zeros_like(image_embeddings)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeddings = torch.cat([uncond_embeddings, image_embeddings])
+
+        return image_embeddings
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        else:
+            has_nsfw_concept = None
+        return image, has_nsfw_concept
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, image, height, width, callback_steps):
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                f"`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `list` but is {type(image)}"
+            )
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
+                The image or images to guide the image generation. If you provide a tensor, it needs to comply with the
+                configuration of
+                [this](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json)
+                `CLIPFeatureExtractor`
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(image, height, width, callback_steps)
+
+        # 2. Define call parameters
+        if isinstance(image, PIL.Image.Image):
+            batch_size = 1
+        elif isinstance(image, list):
+            batch_size = len(image)
+        else:
+            batch_size = image.shape[0]
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input image
+        image_embeddings = self._encode_image(image, device, num_images_per_prompt, do_classifier_free_guidance)
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            image_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        # 8. Post-processing
+        image = self.decode_latents(latents)
+
+        # 9. Run safety checker
+        image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype)
+
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image, has_nsfw_concept)
+
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index ddb93f5d5fa5..c184b6295de7 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -64,6 +64,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class StableDiffusionImageVariationPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class StableDiffusionImg2ImgPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
new file mode 100644
index 000000000000..2935275d0f91
--- /dev/null
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
@@ -0,0 +1,424 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionImageVariationPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
+from transformers import CLIPVisionConfig, CLIPVisionModelWithProjection
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_image_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=32,
+            projection_dim=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            image_size=32,
+            patch_size=4,
+        )
+        return CLIPVisionModelWithProjection(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_stable_diffusion_img_variation_default_case(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        image_encoder = self.dummy_image_encoder
+
+        init_image = self.dummy_image.to(device)
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionImageVariationPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            image_encoder=image_encoder,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            init_image,
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            init_image,
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        print(image_slice.flatten())
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array([0.4935, 0.4784, 0.4802, 0.5027, 0.4805, 0.5149, 0.5143, 0.4879, 0.4731])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_img_variation_multiple_images(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        image_encoder = self.dummy_image_encoder
+
+        init_image = self.dummy_image.to(device).repeat(2, 1, 1, 1)
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionImageVariationPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            image_encoder=image_encoder,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            init_image,
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        image = output.images
+
+        image_slice = image[-1, -3:, -3:, -1]
+
+        assert image.shape == (2, 128, 128, 3)
+        expected_slice = np.array([0.4939, 0.4627, 0.4831, 0.5710, 0.5387, 0.4428, 0.5230, 0.5545, 0.4586])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
+
+    def test_stable_diffusion_img_variation_num_images_per_prompt(self):
+        device = "cpu"
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        image_encoder = self.dummy_image_encoder
+
+        init_image = self.dummy_image.to(device)
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionImageVariationPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            image_encoder=image_encoder,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        # test num_images_per_prompt=1 (default)
+        images = sd_pipe(
+            init_image,
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+
+        assert images.shape == (1, 128, 128, 3)
+
+        # test num_images_per_prompt=1 (default) for batch of images
+        batch_size = 2
+        images = sd_pipe(
+            init_image.repeat(batch_size, 1, 1, 1),
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+
+        assert images.shape == (batch_size, 128, 128, 3)
+
+        # test num_images_per_prompt for single prompt
+        num_images_per_prompt = 2
+        images = sd_pipe(
+            init_image,
+            num_inference_steps=2,
+            output_type="np",
+            num_images_per_prompt=num_images_per_prompt,
+        ).images
+
+        assert images.shape == (num_images_per_prompt, 128, 128, 3)
+
+        # test num_images_per_prompt for batch of prompts
+        batch_size = 2
+        images = sd_pipe(
+            init_image.repeat(batch_size, 1, 1, 1),
+            num_inference_steps=2,
+            output_type="np",
+            num_images_per_prompt=num_images_per_prompt,
+        ).images
+
+        assert images.shape == (batch_size * num_images_per_prompt, 128, 128, 3)
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_img_variation_fp16(self):
+        """Test that stable diffusion img2img works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        image_encoder = self.dummy_image_encoder
+
+        init_image = self.dummy_image.to(torch_device).float()
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        image_encoder = image_encoder.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionImageVariationPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            image_encoder=image_encoder,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe(
+            init_image,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+
+        assert image.shape == (1, 128, 128, 3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionImageVariationPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_img_variation_pipeline_default(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/vermeer.jpg"
+        )
+        init_image = init_image.resize((512, 512))
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/vermeer.npy"
+        )
+
+        model_id = "fusing/sd-image-variations-diffusers"
+        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
+            model_id,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            init_image,
+            guidance_scale=7.5,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        # img2img is flaky across GPUs even in fp32, so using MAE here
+        assert np.abs(expected_image - image).max() < 1e-3
+
+    def test_stable_diffusion_img_variation_intermediate_state(self):
+        number_of_steps = 0
+
+        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
+            test_callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 0:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([1.83, 1.293, -0.09705, 1.256, -2.293, 1.091, -0.0809, -0.65, -2.953])
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+            elif step == 37:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([2.285, 2.703, 1.969, 0.696, -1.323, 0.9253, -0.5464, -1.521, -2.537])
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2
+
+        test_callback_fn.has_been_called = False
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((512, 512))
+
+        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
+            "fusing/sd-image-variations-diffusers",
+            torch_dtype=torch.float16,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            pipe(
+                init_image,
+                num_inference_steps=50,
+                guidance_scale=7.5,
+                generator=generator,
+                callback=test_callback_fn,
+                callback_steps=1,
+            )
+        assert test_callback_fn.has_been_called
+        assert number_of_steps == 51
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/img2img/sketch-mountains-input.jpg"
+        )
+        init_image = init_image.resize((512, 512))
+
+        model_id = "fusing/sd-image-variations-diffusers"
+        lms = LMSDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
+        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
+            model_id, scheduler=lms, safety_checker=None, torch_dtype=torch.float16
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        _ = pipe(
+            init_image,
+            guidance_scale=7.5,
+            generator=generator,
+            output_type="np",
+            num_inference_steps=5,
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.6 GB is allocated
+        assert mem_bytes < 2.6 * 10**9

From 2625fb59dc7a3f03516f0c6c5c0cdda18ef0ef5b Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 23 Nov 2022 19:03:45 +0100
Subject: [PATCH 235/269] [Versatile Diffusion] Add versatile diffusion model
 (#1283)

* up

* convert dual unet

* revert dual attn

* adapt for vd-official

* test the full pipeline

* mixed inference

* mixed inference for text2img

* add image prompting

* fix clip norm

* split text2img and img2img

* fix format

* refactor text2img

* mega pipeline

* add optimus

* refactor image var

* wip text_unet

* text unet end to end

* update tests

* reshape

* fix image to text

* add some first docs

* dual guided pipeline

* fix token ratio

* propose change

* dual transformer as a native module

* DualTransformer(nn.Module)

* DualTransformer(nn.Module)

* correct unconditional image

* save-load with mega pipeline

* remove image to text

* up

* uP

* fix

* up

* final fix

* remove_unused_weights

* test updates

* save progress

* uP

* fix dual prompts

* some fixes

* finish

* style

* finish renaming

* up

* fix

* fix

* fix

* finish

Co-authored-by: anton-l <anton@huggingface.co>
---
 docs/source/_toctree.yml                      |    2 +
 docs/source/api/pipelines/overview.mdx        |    3 +
 .../api/pipelines/versatile_diffusion.mdx     |   73 ++
 docs/source/index.mdx                         |    3 +
 ...onvert_versatile_diffusion_to_diffusers.py |  791 ++++++++++++
 src/diffusers/__init__.py                     |    4 +
 src/diffusers/models/attention.py             |  126 ++
 src/diffusers/models/unet_2d.py               |    2 +-
 src/diffusers/models/unet_2d_blocks.py        |   99 +-
 src/diffusers/models/unet_2d_condition.py     |    6 +-
 src/diffusers/pipelines/__init__.py           |    6 +
 .../pipelines/versatile_diffusion/__init__.py |    9 +
 .../versatile_diffusion/modeling_text_unet.py | 1082 +++++++++++++++++
 .../pipeline_versatile_diffusion.py           |  462 +++++++
 ...ipeline_versatile_diffusion_dual_guided.py |  628 ++++++++++
 ...ine_versatile_diffusion_image_variation.py |  462 +++++++
 ...eline_versatile_diffusion_text_to_image.py |  514 ++++++++
 .../dummy_torch_and_transformers_objects.py   |   60 +
 .../pipelines/versatile_diffusion/__init__.py |    0
 .../test_versatile_diffusion_dual_guided.py   |  112 ++
 ...est_versatile_diffusion_image_variation.py |   58 +
 .../test_versatile_diffusion_mega.py          |  129 ++
 .../test_versatile_diffusion_text_to_image.py |   86 ++
 23 files changed, 4687 insertions(+), 30 deletions(-)
 create mode 100644 docs/source/api/pipelines/versatile_diffusion.mdx
 create mode 100644 scripts/convert_versatile_diffusion_to_diffusers.py
 create mode 100644 src/diffusers/pipelines/versatile_diffusion/__init__.py
 create mode 100644 src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
 create mode 100644 src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
 create mode 100644 src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
 create mode 100644 src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
 create mode 100644 src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
 create mode 100644 tests/pipelines/versatile_diffusion/__init__.py
 create mode 100644 tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py
 create mode 100644 tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
 create mode 100644 tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
 create mode 100644 tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 13687dc5419e..bf23d363a863 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -112,6 +112,8 @@
       title: "Stochastic Karras VE"
     - local: api/pipelines/dance_diffusion
       title: "Dance Diffusion"
+    - local: api/pipelines/versatile_diffusion
+      title: "Versatile Diffusion"
     - local: api/pipelines/vq_diffusion
       title: "VQ Diffusion"
     - local: api/pipelines/repaint
diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index ff83bad05525..c43f09d66dde 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -60,6 +60,9 @@ available a colab notebook to directly try them out.
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
 | [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
 | [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | 
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | 
 | [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 
 
 
diff --git a/docs/source/api/pipelines/versatile_diffusion.mdx b/docs/source/api/pipelines/versatile_diffusion.mdx
new file mode 100644
index 000000000000..f557c5b0aac8
--- /dev/null
+++ b/docs/source/api/pipelines/versatile_diffusion.mdx
@@ -0,0 +1,73 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# VersatileDiffusion
+
+VersatileDiffusion was proposed in [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) by Xingqian Xu, Zhangyang Wang, Eric Zhang, Kai Wang, Humphrey Shi .
+
+The abstract of the paper is the following:
+
+*The recent advances in diffusion models have set an impressive milestone in many generation tasks. Trending works such as DALL-E2, Imagen, and Stable Diffusion have attracted great interest in academia and industry. Despite the rapid landscape changes, recent new approaches focus on extensions and performance rather than capacity, thus requiring separate models for separate tasks. In this work, we expand the existing single-flow diffusion pipeline into a multi-flow network, dubbed Versatile Diffusion (VD), that handles text-to-image, image-to-text, image-variation, and text-variation in one unified model. Moreover, we generalize VD to a unified multi-flow multimodal diffusion framework with grouped layers, swappable streams, and other propositions that can process modalities beyond images and text. Through our experiments, we demonstrate that VD and its underlying framework have the following merits: a) VD handles all subtasks with competitive quality; b) VD initiates novel extensions and applications such as disentanglement of style and semantic, image-text dual-guided generation, etc.; c) Through these experiments and applications, VD provides more semantic insights of the generated outputs.*
+
+## Tips
+
+- VersatileDiffusion is conceptually very similar as [Stable Diffusion](./api/pipelines/stable_diffusion),  but instead of providing just a image data stream conditioned on text, VersatileDiffusion provides both a image and text data stream and can be conditioned on both text and image.
+
+### *Run VersatileDiffusion*
+
+You can both load the memory intensive "all-in-one" [`VersatileDiffusionPipeline`] that can run all tasks 
+with the same class as shown in [`VersatileDiffusionPipeline.text_to_image`], [`VersatileDiffusionPipeline.image_variation`], and [`VersatileDiffusionPipeline.dual_guided`]
+
+**or**
+
+You can run the individual pipelines which are much more memory efficient:
+
+- *Text-to-Image*: [`VersatileDiffusionTextToImagePipeline.__call__`]
+- *Image Variation*: [`VersatileDiffusionImageVariationPipeline.__call__`]
+- *Dual Text and Image Guided Generation*: [`VersatileDiffusionDualGuidedPipeline.__call__`]
+
+### *How to load and use different schedulers.*
+
+The versatile diffusion pipelines uses [`DDIMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the alt diffusion pipeline such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
+To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] method or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
+
+```python
+>>> from diffusers import VersatileDiffusionPipeline, EulerDiscreteScheduler
+
+>>> pipeline = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion")
+>>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+>>> # or
+>>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("shi-labs/versatile-diffusion", subfolder="scheduler")
+>>> pipeline = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", scheduler=euler_scheduler)
+```
+
+## VersatileDiffusionPipeline
+[[autodoc]] VersatileDiffusionPipeline
+
+## VersatileDiffusionTextToImagePipeline
+[[autodoc]] VersatileDiffusionTextToImagePipeline
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+
+## VersatileDiffusionImageVariationPipeline
+[[autodoc]] VersatileDiffusionImageVariationPipeline
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
+
+## VersatileDiffusionDualGuidedPipeline
+[[autodoc]] VersatileDiffusionDualGuidedPipeline
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 1c5ecc5fe3cb..09cc59fda99c 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -50,6 +50,9 @@ available a colab notebook to directly try them out.
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
 | [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
 | [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | 
+| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | 
 | [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | 
 
 **Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
diff --git a/scripts/convert_versatile_diffusion_to_diffusers.py b/scripts/convert_versatile_diffusion_to_diffusers.py
new file mode 100644
index 000000000000..86fb0e7b4c97
--- /dev/null
+++ b/scripts/convert_versatile_diffusion_to_diffusers.py
@@ -0,0 +1,791 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Conversion script for the Versatile Stable Diffusion checkpoints. """
+
+import argparse
+from argparse import Namespace
+
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    UNet2DConditionModel,
+    VersatileDiffusionPipeline,
+)
+from diffusers.pipelines.versatile_diffusion.modeling_text_unet import UNetFlatConditionModel
+from transformers import (
+    CLIPFeatureExtractor,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+
+SCHEDULER_CONFIG = Namespace(
+    **{
+        "beta_linear_start": 0.00085,
+        "beta_linear_end": 0.012,
+        "timesteps": 1000,
+        "scale_factor": 0.18215,
+    }
+)
+
+IMAGE_UNET_CONFIG = Namespace(
+    **{
+        "input_channels": 4,
+        "model_channels": 320,
+        "output_channels": 4,
+        "num_noattn_blocks": [2, 2, 2, 2],
+        "channel_mult": [1, 2, 4, 4],
+        "with_attn": [True, True, True, False],
+        "num_heads": 8,
+        "context_dim": 768,
+        "use_checkpoint": True,
+    }
+)
+
+TEXT_UNET_CONFIG = Namespace(
+    **{
+        "input_channels": 768,
+        "model_channels": 320,
+        "output_channels": 768,
+        "num_noattn_blocks": [2, 2, 2, 2],
+        "channel_mult": [1, 2, 4, 4],
+        "second_dim": [4, 4, 4, 4],
+        "with_attn": [True, True, True, False],
+        "num_heads": 8,
+        "context_dim": 768,
+        "use_checkpoint": True,
+    }
+)
+
+AUTOENCODER_CONFIG = Namespace(
+    **{
+        "double_z": True,
+        "z_channels": 4,
+        "resolution": 256,
+        "in_channels": 3,
+        "out_ch": 3,
+        "ch": 128,
+        "ch_mult": [1, 2, 4, 4],
+        "num_res_blocks": 2,
+        "attn_resolutions": [],
+        "dropout": 0.0,
+    }
+)
+
+
+def shave_segments(path, n_shave_prefix_segments=1):
+    """
+    Removes segments. Positive values shave the first segments, negative shave the last segments.
+    """
+    if n_shave_prefix_segments >= 0:
+        return ".".join(path.split(".")[n_shave_prefix_segments:])
+    else:
+        return ".".join(path.split(".")[:n_shave_prefix_segments])
+
+
+def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item.replace("in_layers.0", "norm1")
+        new_item = new_item.replace("in_layers.2", "conv1")
+
+        new_item = new_item.replace("out_layers.0", "norm2")
+        new_item = new_item.replace("out_layers.3", "conv2")
+
+        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
+        new_item = new_item.replace("skip_connection", "conv_shortcut")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside resnets to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("nin_shortcut", "conv_shortcut")
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        #         new_item = new_item.replace('norm.weight', 'group_norm.weight')
+        #         new_item = new_item.replace('norm.bias', 'group_norm.bias')
+
+        #         new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
+        #         new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
+
+        #         new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
+    """
+    Updates paths inside attentions to the new naming scheme (local renaming)
+    """
+    mapping = []
+    for old_item in old_list:
+        new_item = old_item
+
+        new_item = new_item.replace("norm.weight", "group_norm.weight")
+        new_item = new_item.replace("norm.bias", "group_norm.bias")
+
+        new_item = new_item.replace("q.weight", "query.weight")
+        new_item = new_item.replace("q.bias", "query.bias")
+
+        new_item = new_item.replace("k.weight", "key.weight")
+        new_item = new_item.replace("k.bias", "key.bias")
+
+        new_item = new_item.replace("v.weight", "value.weight")
+        new_item = new_item.replace("v.bias", "value.bias")
+
+        new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
+        new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
+
+        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
+
+        mapping.append({"old": old_item, "new": new_item})
+
+    return mapping
+
+
+def assign_to_checkpoint(
+    paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
+):
+    """
+    This does the final conversion step: take locally converted weights and apply a global renaming
+    to them. It splits attention layers, and takes into account additional replacements
+    that may arise.
+
+    Assigns the weights to the new checkpoint.
+    """
+    assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
+
+    # Splits the attention layers into three variables.
+    if attention_paths_to_split is not None:
+        for path, path_map in attention_paths_to_split.items():
+            old_tensor = old_checkpoint[path]
+            channels = old_tensor.shape[0] // 3
+
+            target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
+
+            num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
+
+            old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
+            query, key, value = old_tensor.split(channels // num_heads, dim=1)
+
+            checkpoint[path_map["query"]] = query.reshape(target_shape)
+            checkpoint[path_map["key"]] = key.reshape(target_shape)
+            checkpoint[path_map["value"]] = value.reshape(target_shape)
+
+    for path in paths:
+        new_path = path["new"]
+
+        # These have already been assigned
+        if attention_paths_to_split is not None and new_path in attention_paths_to_split:
+            continue
+
+        # Global renaming happens here
+        new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
+        new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
+        new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
+
+        if additional_replacements is not None:
+            for replacement in additional_replacements:
+                new_path = new_path.replace(replacement["old"], replacement["new"])
+
+        # proj_attn.weight has to be converted from conv 1D to linear
+        if "proj_attn.weight" in new_path:
+            checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
+        elif path["old"] in old_checkpoint:
+            checkpoint[new_path] = old_checkpoint[path["old"]]
+
+
+def conv_attn_to_linear(checkpoint):
+    keys = list(checkpoint.keys())
+    attn_keys = ["query.weight", "key.weight", "value.weight"]
+    for key in keys:
+        if ".".join(key.split(".")[-2:]) in attn_keys:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0, 0]
+        elif "proj_attn.weight" in key:
+            if checkpoint[key].ndim > 2:
+                checkpoint[key] = checkpoint[key][:, :, 0]
+
+
+def create_image_unet_diffusers_config(unet_params):
+    """
+    Creates a config for the diffusers based on the config of the VD model.
+    """
+
+    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlock2D" if unet_params.with_attn[i] else "DownBlock2D"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlock2D" if unet_params.with_attn[-i - 1] else "UpBlock2D"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    if not all(n == unet_params.num_noattn_blocks[0] for n in unet_params.num_noattn_blocks):
+        raise ValueError("Not all num_res_blocks are equal, which is not supported in this script.")
+
+    config = dict(
+        sample_size=None,
+        in_channels=unet_params.input_channels,
+        out_channels=unet_params.output_channels,
+        down_block_types=tuple(down_block_types),
+        up_block_types=tuple(up_block_types),
+        block_out_channels=tuple(block_out_channels),
+        layers_per_block=unet_params.num_noattn_blocks[0],
+        cross_attention_dim=unet_params.context_dim,
+        attention_head_dim=unet_params.num_heads,
+    )
+
+    return config
+
+
+def create_text_unet_diffusers_config(unet_params):
+    """
+    Creates a config for the diffusers based on the config of the VD model.
+    """
+
+    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
+
+    down_block_types = []
+    resolution = 1
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnDownBlockFlat" if unet_params.with_attn[i] else "DownBlockFlat"
+        down_block_types.append(block_type)
+        if i != len(block_out_channels) - 1:
+            resolution *= 2
+
+    up_block_types = []
+    for i in range(len(block_out_channels)):
+        block_type = "CrossAttnUpBlockFlat" if unet_params.with_attn[-i - 1] else "UpBlockFlat"
+        up_block_types.append(block_type)
+        resolution //= 2
+
+    if not all(n == unet_params.num_noattn_blocks[0] for n in unet_params.num_noattn_blocks):
+        raise ValueError("Not all num_res_blocks are equal, which is not supported in this script.")
+
+    config = dict(
+        sample_size=None,
+        in_channels=(unet_params.input_channels, 1, 1),
+        out_channels=(unet_params.output_channels, 1, 1),
+        down_block_types=tuple(down_block_types),
+        up_block_types=tuple(up_block_types),
+        block_out_channels=tuple(block_out_channels),
+        layers_per_block=unet_params.num_noattn_blocks[0],
+        cross_attention_dim=unet_params.context_dim,
+        attention_head_dim=unet_params.num_heads,
+    )
+
+    return config
+
+
+def create_vae_diffusers_config(vae_params):
+    """
+    Creates a config for the diffusers based on the config of the VD model.
+    """
+
+    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
+
+    config = dict(
+        sample_size=vae_params.resolution,
+        in_channels=vae_params.in_channels,
+        out_channels=vae_params.out_ch,
+        down_block_types=tuple(down_block_types),
+        up_block_types=tuple(up_block_types),
+        block_out_channels=tuple(block_out_channels),
+        latent_channels=vae_params.z_channels,
+        layers_per_block=vae_params.num_res_blocks,
+    )
+    return config
+
+
+def create_diffusers_scheduler(original_config):
+    schedular = DDIMScheduler(
+        num_train_timesteps=original_config.model.params.timesteps,
+        beta_start=original_config.model.params.linear_start,
+        beta_end=original_config.model.params.linear_end,
+        beta_schedule="scaled_linear",
+    )
+    return schedular
+
+
+def convert_vd_unet_checkpoint(checkpoint, config, unet_key, extract_ema=False):
+    """
+    Takes a state dict and a config, and returns a converted checkpoint.
+    """
+
+    # extract state_dict for UNet
+    unet_state_dict = {}
+    keys = list(checkpoint.keys())
+
+    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
+    if sum(k.startswith("model_ema") for k in keys) > 100:
+        print("Checkpoint has both EMA and non-EMA weights.")
+        if extract_ema:
+            print(
+                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
+                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
+            )
+            for key in keys:
+                if key.startswith("model.diffusion_model"):
+                    flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
+                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
+        else:
+            print(
+                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
+                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
+            )
+
+    for key in keys:
+        if key.startswith(unet_key):
+            unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["time_embedding.linear_1.weight"] = checkpoint["model.diffusion_model.time_embed.0.weight"]
+    new_checkpoint["time_embedding.linear_1.bias"] = checkpoint["model.diffusion_model.time_embed.0.bias"]
+    new_checkpoint["time_embedding.linear_2.weight"] = checkpoint["model.diffusion_model.time_embed.2.weight"]
+    new_checkpoint["time_embedding.linear_2.bias"] = checkpoint["model.diffusion_model.time_embed.2.bias"]
+
+    new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
+    new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
+
+    new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
+    new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
+    new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
+    new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
+
+    # Retrieves the keys for the input blocks only
+    num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
+    input_blocks = {
+        layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
+        for layer_id in range(num_input_blocks)
+    }
+
+    # Retrieves the keys for the middle blocks only
+    num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
+    middle_blocks = {
+        layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
+        for layer_id in range(num_middle_blocks)
+    }
+
+    # Retrieves the keys for the output blocks only
+    num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
+    output_blocks = {
+        layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
+        for layer_id in range(num_output_blocks)
+    }
+
+    for i in range(1, num_input_blocks):
+        block_id = (i - 1) // (config["layers_per_block"] + 1)
+        layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
+
+        resnets = [
+            key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
+        ]
+        attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
+
+        if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.op.bias"
+            )
+        elif f"input_blocks.{i}.0.weight" in unet_state_dict:
+            # text_unet uses linear layers in place of downsamplers
+            shape = unet_state_dict[f"input_blocks.{i}.0.weight"].shape
+            if shape[0] != shape[1]:
+                continue
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.weight"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.weight"
+            )
+            new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.bias"] = unet_state_dict.pop(
+                f"input_blocks.{i}.0.bias"
+            )
+
+        paths = renew_resnet_paths(resnets)
+        meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
+        assign_to_checkpoint(
+            paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+        )
+
+        if len(attentions):
+            paths = renew_attention_paths(attentions)
+            meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+    resnet_0 = middle_blocks[0]
+    attentions = middle_blocks[1]
+    resnet_1 = middle_blocks[2]
+
+    resnet_0_paths = renew_resnet_paths(resnet_0)
+    assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
+
+    resnet_1_paths = renew_resnet_paths(resnet_1)
+    assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
+
+    attentions_paths = renew_attention_paths(attentions)
+    meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(
+        attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+    )
+
+    for i in range(num_output_blocks):
+        block_id = i // (config["layers_per_block"] + 1)
+        layer_in_block_id = i % (config["layers_per_block"] + 1)
+        output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
+        output_block_list = {}
+
+        for layer in output_block_layers:
+            layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
+            if layer_id in output_block_list:
+                output_block_list[layer_id].append(layer_name)
+            else:
+                output_block_list[layer_id] = [layer_name]
+
+        if len(output_block_list) > 1:
+            resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
+            attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
+
+            paths = renew_resnet_paths(resnets)
+
+            meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
+            assign_to_checkpoint(
+                paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+            )
+
+            if ["conv.weight", "conv.bias"] in output_block_list.values():
+                index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.weight"
+                ]
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
+                    f"output_blocks.{i}.{index}.conv.bias"
+                ]
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+            elif f"output_blocks.{i}.1.weight" in unet_state_dict:
+                # text_unet uses linear layers in place of upsamplers
+                shape = unet_state_dict[f"output_blocks.{i}.1.weight"].shape
+                if shape[0] != shape[1]:
+                    continue
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.weight"] = unet_state_dict.pop(
+                    f"output_blocks.{i}.1.weight"
+                )
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.bias"] = unet_state_dict.pop(
+                    f"output_blocks.{i}.1.bias"
+                )
+                # Clear attentions as they have been attributed above.
+                if len(attentions) == 2:
+                    attentions = []
+            elif f"output_blocks.{i}.2.weight" in unet_state_dict:
+                # text_unet uses linear layers in place of upsamplers
+                shape = unet_state_dict[f"output_blocks.{i}.2.weight"].shape
+                if shape[0] != shape[1]:
+                    continue
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.weight"] = unet_state_dict.pop(
+                    f"output_blocks.{i}.2.weight"
+                )
+                new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.bias"] = unet_state_dict.pop(
+                    f"output_blocks.{i}.2.bias"
+                )
+
+            if len(attentions):
+                paths = renew_attention_paths(attentions)
+                meta_path = {
+                    "old": f"output_blocks.{i}.1",
+                    "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
+                }
+                assign_to_checkpoint(
+                    paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
+                )
+        else:
+            resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
+            for path in resnet_0_paths:
+                old_path = ".".join(["output_blocks", str(i), path["old"]])
+                new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
+
+                new_checkpoint[new_path] = unet_state_dict[old_path]
+
+    return new_checkpoint
+
+
+def convert_vd_vae_checkpoint(checkpoint, config):
+    # extract state dict for VAE
+    vae_state_dict = {}
+    keys = list(checkpoint.keys())
+    for key in keys:
+        vae_state_dict[key] = checkpoint.get(key)
+
+    new_checkpoint = {}
+
+    new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
+    new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
+    new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
+    new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
+    new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
+    new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
+
+    new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
+    new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
+    new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
+    new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
+    new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
+    new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
+
+    new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
+    new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
+    new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
+    new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
+
+    # Retrieves the keys for the encoder down blocks only
+    num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
+    down_blocks = {
+        layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)
+    }
+
+    # Retrieves the keys for the decoder up blocks only
+    num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
+    up_blocks = {
+        layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)
+    }
+
+    for i in range(num_down_blocks):
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
+
+        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.weight"
+            )
+            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
+                f"encoder.down.{i}.downsample.conv.bias"
+            )
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+
+    for i in range(num_up_blocks):
+        block_id = num_up_blocks - 1 - i
+        resnets = [
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
+        ]
+
+        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.weight"
+            ]
+            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
+                f"decoder.up.{block_id}.upsample.conv.bias"
+            ]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
+    num_mid_res_blocks = 2
+    for i in range(1, num_mid_res_blocks + 1):
+        resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
+
+        paths = renew_vae_resnet_paths(resnets)
+        meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
+        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+
+    mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
+    paths = renew_vae_attention_paths(mid_attentions)
+    meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
+    assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
+    conv_attn_to_linear(new_checkpoint)
+    return new_checkpoint
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--unet_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--vae_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--optimus_checkpoint_path", default=None, type=str, required=False, help="Path to the checkpoint to convert."
+    )
+    parser.add_argument(
+        "--scheduler_type",
+        default="pndm",
+        type=str,
+        help="Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler', 'euler-ancest', 'dpm']",
+    )
+    parser.add_argument(
+        "--extract_ema",
+        action="store_true",
+        help=(
+            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
+            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
+            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
+        ),
+    )
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+
+    args = parser.parse_args()
+
+    scheduler_config = SCHEDULER_CONFIG
+
+    num_train_timesteps = scheduler_config.timesteps
+    beta_start = scheduler_config.beta_linear_start
+    beta_end = scheduler_config.beta_linear_end
+    if args.scheduler_type == "pndm":
+        scheduler = PNDMScheduler(
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            beta_start=beta_start,
+            num_train_timesteps=num_train_timesteps,
+            skip_prk_steps=True,
+            steps_offset=1,
+        )
+    elif args.scheduler_type == "lms":
+        scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
+    elif args.scheduler_type == "euler":
+        scheduler = EulerDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear")
+    elif args.scheduler_type == "euler-ancestral":
+        scheduler = EulerAncestralDiscreteScheduler(
+            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+        )
+    elif args.scheduler_type == "dpm":
+        scheduler = DPMSolverMultistepScheduler(
+            beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear"
+        )
+    elif args.scheduler_type == "ddim":
+        scheduler = DDIMScheduler(
+            beta_start=beta_start,
+            beta_end=beta_end,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+            steps_offset=1,
+        )
+    else:
+        raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!")
+
+    # Convert the UNet2DConditionModel models.
+    if args.unet_checkpoint_path is not None:
+        # image UNet
+        image_unet_config = create_image_unet_diffusers_config(IMAGE_UNET_CONFIG)
+        checkpoint = torch.load(args.unet_checkpoint_path)
+        converted_image_unet_checkpoint = convert_vd_unet_checkpoint(
+            checkpoint, image_unet_config, unet_key="model.diffusion_model.unet_image.", extract_ema=args.extract_ema
+        )
+        image_unet = UNet2DConditionModel(**image_unet_config)
+        image_unet.load_state_dict(converted_image_unet_checkpoint)
+
+        # text UNet
+        text_unet_config = create_text_unet_diffusers_config(TEXT_UNET_CONFIG)
+        converted_text_unet_checkpoint = convert_vd_unet_checkpoint(
+            checkpoint, text_unet_config, unet_key="model.diffusion_model.unet_text.", extract_ema=args.extract_ema
+        )
+        text_unet = UNetFlatConditionModel(**text_unet_config)
+        text_unet.load_state_dict(converted_text_unet_checkpoint)
+
+    # Convert the VAE model.
+    if args.vae_checkpoint_path is not None:
+        vae_config = create_vae_diffusers_config(AUTOENCODER_CONFIG)
+        checkpoint = torch.load(args.vae_checkpoint_path)
+        converted_vae_checkpoint = convert_vd_vae_checkpoint(checkpoint, vae_config)
+
+        vae = AutoencoderKL(**vae_config)
+        vae.load_state_dict(converted_vae_checkpoint)
+
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    image_feature_extractor = CLIPFeatureExtractor.from_pretrained("openai/clip-vit-large-patch14")
+    text_encoder = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+    image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+
+    pipe = VersatileDiffusionPipeline(
+        scheduler=scheduler,
+        tokenizer=tokenizer,
+        image_feature_extractor=image_feature_extractor,
+        text_encoder=text_encoder,
+        image_encoder=image_encoder,
+        image_unet=image_unet,
+        text_unet=text_unet,
+        vae=vae,
+    )
+    pipe.save_pretrained(args.dump_path)
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 9669382bf69f..c4052be34e1e 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -75,6 +75,10 @@
         StableDiffusionInpaintPipelineLegacy,
         StableDiffusionPipeline,
         StableDiffusionPipelineSafe,
+        VersatileDiffusionDualGuidedPipeline,
+        VersatileDiffusionImageVariationPipeline,
+        VersatileDiffusionPipeline,
+        VersatileDiffusionTextToImagePipeline,
         VQDiffusionPipeline,
     )
 else:
diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 7e11bde27347..6b2bd5205bdb 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -699,3 +699,129 @@ def forward(self, x, timestep):
         scale, shift = torch.chunk(emb, 2)
         x = self.norm(x) * (1 + scale) + shift
         return x
+
+
+class DualTransformer2DModel(nn.Module):
+    """
+    Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference.
+
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            Pass if the input is continuous. The number of channels in the input and output.
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of context dimensions to use.
+        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
+            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
+            `ImagePositionalEmbeddings`.
+        num_vector_embeds (`int`, *optional*):
+            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+            Includes the class for the masked latent pixel.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
+            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
+            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
+            up to but not more than steps than `num_embeds_ada_norm`.
+        attention_bias (`bool`, *optional*):
+            Configure if the TransformerBlocks' attention should contain a bias parameter.
+    """
+
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+    ):
+        super().__init__()
+        self.transformers = nn.ModuleList(
+            [
+                Transformer2DModel(
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    in_channels=in_channels,
+                    num_layers=num_layers,
+                    dropout=dropout,
+                    norm_num_groups=norm_num_groups,
+                    cross_attention_dim=cross_attention_dim,
+                    attention_bias=attention_bias,
+                    sample_size=sample_size,
+                    num_vector_embeds=num_vector_embeds,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                )
+                for _ in range(2)
+            ]
+        )
+
+        # Variables that can be set by a pipeline:
+
+        # The ratio of transformer1 to transformer2's output states to be combined during inference
+        self.mix_ratio = 0.5
+
+        # The shape of `encoder_hidden_states` is expected to be
+        # `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)`
+        self.condition_lengths = [77, 257]
+
+        # Which transformer to use to encode which condition.
+        # E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])`
+        self.transformer_index_for_condition = [1, 0]
+
+    def forward(self, hidden_states, encoder_hidden_states, timestep=None, return_dict: bool = True):
+        """
+        Args:
+            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
+                When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
+                hidden_states
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, context dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.long`, *optional*):
+                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
+            if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
+            tensor.
+        """
+        input_states = hidden_states
+
+        encoded_states = []
+        tokens_start = 0
+        for i in range(2):
+            # for each of the two transformers, pass the corresponding condition tokens
+            condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]]
+            transformer_index = self.transformer_index_for_condition[i]
+            encoded_state = self.transformers[transformer_index](input_states, condition_state, timestep, return_dict)[
+                0
+            ]
+            encoded_states.append(encoded_state - input_states)
+            tokens_start += self.condition_lengths[i]
+
+        output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio)
+        output_states = output_states + input_states
+
+        if not return_dict:
+            return (output_states,)
+
+        return Transformer2DModelOutput(sample=output_states)
+
+    def _set_attention_slice(self, slice_size):
+        for transformer in self.transformers:
+            transformer._set_attention_slice(slice_size)
+
+    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for transformer in self.transformers:
+            transformer._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py
index 6d28f07b18a6..5b337f482cc5 100644
--- a/src/diffusers/models/unet_2d.py
+++ b/src/diffusers/models/unet_2d.py
@@ -175,7 +175,7 @@ def __init__(
         num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
         self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=norm_eps)
         self.conv_act = nn.SiLU()
-        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
+        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
 
     def forward(
         self,
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 770043f053b2..4dd15845e04f 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -15,7 +15,7 @@
 import torch
 from torch import nn
 
-from .attention import AttentionBlock, Transformer2DModel
+from .attention import AttentionBlock, DualTransformer2DModel, Transformer2DModel
 from .resnet import Downsample2D, FirDownsample2D, FirUpsample2D, ResnetBlock2D, Upsample2D
 
 
@@ -32,6 +32,7 @@ def get_down_block(
     resnet_groups=None,
     cross_attention_dim=None,
     downsample_padding=None,
+    dual_cross_attention=False,
 ):
     down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
     if down_block_type == "DownBlock2D":
@@ -74,6 +75,7 @@ def get_down_block(
             downsample_padding=downsample_padding,
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
         )
     elif down_block_type == "SkipDownBlock2D":
         return SkipDownBlock2D(
@@ -137,6 +139,7 @@ def get_up_block(
     attn_num_head_channels,
     resnet_groups=None,
     cross_attention_dim=None,
+    dual_cross_attention=False,
 ):
     up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
     if up_block_type == "UpBlock2D":
@@ -166,6 +169,7 @@ def get_up_block(
             resnet_groups=resnet_groups,
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
         )
     elif up_block_type == "AttnUpBlock2D":
         return AttnUpBlock2D(
@@ -322,6 +326,7 @@ def __init__(
         attention_type="default",
         output_scale_factor=1.0,
         cross_attention_dim=1280,
+        dual_cross_attention=False,
         **kwargs,
     ):
         super().__init__()
@@ -348,16 +353,28 @@ def __init__(
         attentions = []
 
         for _ in range(num_layers):
-            attentions.append(
-                Transformer2DModel(
-                    attn_num_head_channels,
-                    in_channels // attn_num_head_channels,
-                    in_channels=in_channels,
-                    num_layers=1,
-                    cross_attention_dim=cross_attention_dim,
-                    norm_num_groups=resnet_groups,
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        attn_num_head_channels,
+                        in_channels // attn_num_head_channels,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        attn_num_head_channels,
+                        in_channels // attn_num_head_channels,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
                 )
-            )
             resnets.append(
                 ResnetBlock2D(
                     in_channels=in_channels,
@@ -505,6 +522,7 @@ def __init__(
         output_scale_factor=1.0,
         downsample_padding=1,
         add_downsample=True,
+        dual_cross_attention=False,
     ):
         super().__init__()
         resnets = []
@@ -529,16 +547,28 @@ def __init__(
                     pre_norm=resnet_pre_norm,
                 )
             )
-            attentions.append(
-                Transformer2DModel(
-                    attn_num_head_channels,
-                    out_channels // attn_num_head_channels,
-                    in_channels=out_channels,
-                    num_layers=1,
-                    cross_attention_dim=cross_attention_dim,
-                    norm_num_groups=resnet_groups,
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
                 )
-            )
         self.attentions = nn.ModuleList(attentions)
         self.resnets = nn.ModuleList(resnets)
 
@@ -1089,6 +1119,7 @@ def __init__(
         attention_type="default",
         output_scale_factor=1.0,
         add_upsample=True,
+        dual_cross_attention=False,
     ):
         super().__init__()
         resnets = []
@@ -1115,16 +1146,28 @@ def __init__(
                     pre_norm=resnet_pre_norm,
                 )
             )
-            attentions.append(
-                Transformer2DModel(
-                    attn_num_head_channels,
-                    out_channels // attn_num_head_channels,
-                    in_channels=out_channels,
-                    num_layers=1,
-                    cross_attention_dim=cross_attention_dim,
-                    norm_num_groups=resnet_groups,
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
                 )
-            )
         self.attentions = nn.ModuleList(attentions)
         self.resnets = nn.ModuleList(resnets)
 
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index b09044b57b7b..4eaed803ceee 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -107,6 +107,7 @@ def __init__(
         norm_eps: float = 1e-5,
         cross_attention_dim: int = 1280,
         attention_head_dim: int = 8,
+        dual_cross_attention: bool = False,
     ):
         super().__init__()
 
@@ -146,6 +147,7 @@ def __init__(
                 cross_attention_dim=cross_attention_dim,
                 attn_num_head_channels=attention_head_dim,
                 downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
             )
             self.down_blocks.append(down_block)
 
@@ -160,6 +162,7 @@ def __init__(
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attention_head_dim,
             resnet_groups=norm_num_groups,
+            dual_cross_attention=dual_cross_attention,
         )
 
         # count how many layers upsample the images
@@ -195,6 +198,7 @@ def __init__(
                 resnet_groups=norm_num_groups,
                 cross_attention_dim=cross_attention_dim,
                 attn_num_head_channels=attention_head_dim,
+                dual_cross_attention=dual_cross_attention,
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
@@ -202,7 +206,7 @@ def __init__(
         # out
         self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
         self.conv_act = nn.SiLU()
-        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
+        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
 
     def set_attention_slice(self, slice_size):
         if slice_size is not None and self.config.attention_head_dim % slice_size != 0:
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index a27cb5d2076a..9f4cef4b73e6 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -26,6 +26,12 @@
         StableDiffusionPipeline,
     )
     from .stable_diffusion_safe import StableDiffusionPipelineSafe
+    from .versatile_diffusion import (
+        VersatileDiffusionDualGuidedPipeline,
+        VersatileDiffusionImageVariationPipeline,
+        VersatileDiffusionPipeline,
+        VersatileDiffusionTextToImagePipeline,
+    )
     from .vq_diffusion import VQDiffusionPipeline
 
 if is_transformers_available() and is_onnx_available():
diff --git a/src/diffusers/pipelines/versatile_diffusion/__init__.py b/src/diffusers/pipelines/versatile_diffusion/__init__.py
new file mode 100644
index 000000000000..65bc1b72007a
--- /dev/null
+++ b/src/diffusers/pipelines/versatile_diffusion/__init__.py
@@ -0,0 +1,9 @@
+from ...utils import is_torch_available, is_transformers_available
+
+
+if is_transformers_available() and is_torch_available():
+    from .modeling_text_unet import UNetFlatConditionModel
+    from .pipeline_versatile_diffusion import VersatileDiffusionPipeline
+    from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline
+    from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline
+    from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
new file mode 100644
index 000000000000..c89080a59e9d
--- /dev/null
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -0,0 +1,1082 @@
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...modeling_utils import ModelMixin
+from ...models.attention import DualTransformer2DModel, Transformer2DModel
+from ...models.embeddings import TimestepEmbedding, Timesteps
+from ...models.unet_2d_condition import UNet2DConditionOutput
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def get_down_block(
+    down_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    temb_channels,
+    add_downsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    downsample_padding=None,
+    dual_cross_attention=None,
+):
+    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+    if down_block_type == "DownBlockFlat":
+        return DownBlockFlat(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+        )
+    elif down_block_type == "CrossAttnDownBlockFlat":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockFlat")
+        return CrossAttnDownBlockFlat(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    raise ValueError(f"{down_block_type} is not supported.")
+
+
+def get_up_block(
+    up_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    prev_output_channel,
+    temb_channels,
+    add_upsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    dual_cross_attention=None,
+):
+    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    if up_block_type == "UpBlockFlat":
+        return UpBlockFlat(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+        )
+    elif up_block_type == "CrossAttnUpBlockFlat":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockFlat")
+        return CrossAttnUpBlockFlat(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+        )
+    raise ValueError(f"{up_block_type} is not supported.")
+
+
+# Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel with UNet2DConditionModel->UNetFlatConditionModel, nn.Conv2d->LinearMultiDim, Block2D->BlockFlat
+class UNetFlatConditionModel(ModelMixin, ConfigMixin):
+    r"""
+    UNetFlatConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a
+    timestep and returns sample shaped output.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the models (such as downloading or saving, etc.)
+
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "DownBlockFlat")`):
+            The tuple of downsample blocks to use.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat",)`):
+            The tuple of upsample blocks to use.
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+    """
+
+    _supports_gradient_checkpointing = True
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlockFlat",
+            "CrossAttnDownBlockFlat",
+            "CrossAttnDownBlockFlat",
+            "DownBlockFlat",
+        ),
+        up_block_types: Tuple[str] = (
+            "UpBlockFlat",
+            "CrossAttnUpBlockFlat",
+            "CrossAttnUpBlockFlat",
+            "CrossAttnUpBlockFlat",
+        ),
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: int = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        attention_head_dim: int = 8,
+        dual_cross_attention: bool = False,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+        time_embed_dim = block_out_channels[0] * 4
+
+        # input
+        self.conv_in = LinearMultiDim(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
+
+        # time
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+        self.down_blocks = nn.ModuleList([])
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim,
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlockFlatCrossAttn(
+            in_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            resnet_time_scale_shift="default",
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attention_head_dim,
+            resnet_groups=norm_num_groups,
+            dual_cross_attention=dual_cross_attention,
+        )
+
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim,
+                dual_cross_attention=dual_cross_attention,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+
+        # out
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
+        self.conv_act = nn.SiLU()
+        self.conv_out = LinearMultiDim(block_out_channels[0], out_channels, kernel_size=3, padding=1)
+
+    def set_attention_slice(self, slice_size):
+        if slice_size is not None and self.config.attention_head_dim % slice_size != 0:
+            raise ValueError(
+                f"Make sure slice_size {slice_size} is a divisor of "
+                f"the number of heads used in cross_attention {self.config.attention_head_dim}"
+            )
+        if slice_size is not None and slice_size > self.config.attention_head_dim:
+            raise ValueError(
+                f"Chunk_size {slice_size} has to be smaller or equal to "
+                f"the number of heads used in cross_attention {self.config.attention_head_dim}"
+            )
+
+        for block in self.down_blocks:
+            if hasattr(block, "attentions") and block.attentions is not None:
+                block.set_attention_slice(slice_size)
+
+        self.mid_block.set_attention_slice(slice_size)
+
+        for block in self.up_blocks:
+            if hasattr(block, "attentions") and block.attentions is not None:
+                block.set_attention_slice(slice_size)
+
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for block in self.down_blocks:
+            if hasattr(block, "attentions") and block.attentions is not None:
+                block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+        self.mid_block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+        for block in self.up_blocks:
+            if hasattr(block, "attentions") and block.attentions is not None:
+                block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (CrossAttnDownBlockFlat, DownBlockFlat, CrossAttnUpBlockFlat, UpBlockFlat)):
+            module.gradient_checkpointing = value
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        return_dict: bool = True,
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
+            encoder_hidden_states (`torch.FloatTensor`):
+                (batch_size, sequence_length, hidden_size) encoder hidden states
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
+        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+        emb = self.time_embedding(t_emb)
+
+        # 2. pre-process
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states)
+
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if hasattr(upsample_block, "attentions") and upsample_block.attentions is not None:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    upsample_size=upsample_size,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+                )
+        # 6. post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet2DConditionOutput(sample=sample)
+
+
+class LinearMultiDim(nn.Linear):
+    def __init__(self, in_features, out_features=None, second_dim=4, *args, **kwargs):
+        in_features = [in_features, second_dim, 1] if isinstance(in_features, int) else list(in_features)
+        if out_features is None:
+            out_features = in_features
+        out_features = [out_features, second_dim, 1] if isinstance(out_features, int) else list(out_features)
+        self.in_features_multidim = in_features
+        self.out_features_multidim = out_features
+        super().__init__(np.array(in_features).prod(), np.array(out_features).prod())
+
+    def forward(self, input_tensor, *args, **kwargs):
+        shape = input_tensor.shape
+        n_dim = len(self.in_features_multidim)
+        input_tensor = input_tensor.reshape(*shape[0:-n_dim], self.in_features)
+        output_tensor = super().forward(input_tensor)
+        output_tensor = output_tensor.view(*shape[0:-n_dim], *self.out_features_multidim)
+        return output_tensor
+
+
+class ResnetBlockFlat(nn.Module):
+    def __init__(
+        self,
+        *,
+        in_channels,
+        out_channels=None,
+        dropout=0.0,
+        temb_channels=512,
+        groups=32,
+        groups_out=None,
+        pre_norm=True,
+        eps=1e-6,
+        time_embedding_norm="default",
+        use_in_shortcut=None,
+        second_dim=4,
+        **kwargs,
+    ):
+        super().__init__()
+        self.pre_norm = pre_norm
+        self.pre_norm = True
+
+        in_channels = [in_channels, second_dim, 1] if isinstance(in_channels, int) else list(in_channels)
+        self.in_channels_prod = np.array(in_channels).prod()
+        self.channels_multidim = in_channels
+
+        if out_channels is not None:
+            out_channels = [out_channels, second_dim, 1] if isinstance(out_channels, int) else list(out_channels)
+            out_channels_prod = np.array(out_channels).prod()
+            self.out_channels_multidim = out_channels
+        else:
+            out_channels_prod = self.in_channels_prod
+            self.out_channels_multidim = self.channels_multidim
+        self.time_embedding_norm = time_embedding_norm
+
+        if groups_out is None:
+            groups_out = groups
+
+        self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=self.in_channels_prod, eps=eps, affine=True)
+        self.conv1 = torch.nn.Conv2d(self.in_channels_prod, out_channels_prod, kernel_size=1, padding=0)
+
+        if temb_channels is not None:
+            self.time_emb_proj = torch.nn.Linear(temb_channels, out_channels_prod)
+        else:
+            self.time_emb_proj = None
+
+        self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels_prod, eps=eps, affine=True)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels_prod, out_channels_prod, kernel_size=1, padding=0)
+
+        self.nonlinearity = nn.SiLU()
+
+        self.use_in_shortcut = (
+            self.in_channels_prod != out_channels_prod if use_in_shortcut is None else use_in_shortcut
+        )
+
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = torch.nn.Conv2d(
+                self.in_channels_prod, out_channels_prod, kernel_size=1, stride=1, padding=0
+            )
+
+    def forward(self, input_tensor, temb):
+        shape = input_tensor.shape
+        n_dim = len(self.channels_multidim)
+        input_tensor = input_tensor.reshape(*shape[0:-n_dim], self.in_channels_prod, 1, 1)
+        input_tensor = input_tensor.view(-1, self.in_channels_prod, 1, 1)
+
+        hidden_states = input_tensor
+
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        if temb is not None:
+            temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
+            hidden_states = hidden_states + temb
+
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.nonlinearity(hidden_states)
+
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.conv_shortcut is not None:
+            input_tensor = self.conv_shortcut(input_tensor)
+
+        output_tensor = input_tensor + hidden_states
+
+        output_tensor = output_tensor.view(*shape[0:-n_dim], -1)
+        output_tensor = output_tensor.view(*shape[0:-n_dim], *self.out_channels_multidim)
+
+        return output_tensor
+
+
+# Copied from diffusers.models.unet_2d_blocks.DownBlock2D with DownBlock2D->DownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim
+class DownBlockFlat(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_downsample=True,
+        downsample_padding=1,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    LinearMultiDim(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states, temb=None):
+        output_states = ()
+
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+            else:
+                hidden_states = resnet(hidden_states, temb)
+
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+# Copied from diffusers.models.unet_2d_blocks.CrossAttnDownBlock2D with CrossAttnDownBlock2D->CrossAttnDownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim
+class CrossAttnDownBlockFlat(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        attention_type="default",
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_downsample=True,
+        dual_cross_attention=False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    LinearMultiDim(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def set_attention_slice(self, slice_size):
+        if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+            raise ValueError(
+                f"Make sure slice_size {slice_size} is a divisor of "
+                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+            )
+        if slice_size is not None and slice_size > self.attn_num_head_channels:
+            raise ValueError(
+                f"Chunk_size {slice_size} has to be smaller or equal to "
+                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+            )
+
+        for attn in self.attentions:
+            attn._set_attention_slice(slice_size)
+
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for attn in self.attentions:
+            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
+        output_states = ()
+
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
+                )[0]
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
+
+            output_states += (hidden_states,)
+
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+
+            output_states += (hidden_states,)
+
+        return hidden_states, output_states
+
+
+# Copied from diffusers.models.unet_2d_blocks.UpBlock2D with UpBlock2D->UpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim
+class UpBlockFlat(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        resnets = []
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+            else:
+                hidden_states = resnet(hidden_states, temb)
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+
+        return hidden_states
+
+
+# Copied from diffusers.models.unet_2d_blocks.CrossAttnUpBlock2D with CrossAttnUpBlock2D->CrossAttnUpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim
+class CrossAttnUpBlockFlat(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        attention_type="default",
+        output_scale_factor=1.0,
+        add_upsample=True,
+        dual_cross_attention=False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+
+        self.gradient_checkpointing = False
+
+    def set_attention_slice(self, slice_size):
+        if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+            raise ValueError(
+                f"Make sure slice_size {slice_size} is a divisor of "
+                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+            )
+        if slice_size is not None and slice_size > self.attn_num_head_channels:
+            raise ValueError(
+                f"Chunk_size {slice_size} has to be smaller or equal to "
+                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+            )
+
+        for attn in self.attentions:
+            attn._set_attention_slice(slice_size)
+
+        self.gradient_checkpointing = False
+
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for attn in self.attentions:
+            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+    def forward(
+        self,
+        hidden_states,
+        res_hidden_states_tuple,
+        temb=None,
+        encoder_hidden_states=None,
+        upsample_size=None,
+    ):
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states
+                )[0]
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
+
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+
+        return hidden_states
+
+
+# Copied from diffusers.models.unet_2d_blocks.UNetMidBlock2DCrossAttn with UNetMidBlock2DCrossAttn->UNetMidBlockFlatCrossAttn, ResnetBlock2D->ResnetBlockFlat
+class UNetMidBlockFlatCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        attention_type="default",
+        output_scale_factor=1.0,
+        cross_attention_dim=1280,
+        dual_cross_attention=False,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.attention_type = attention_type
+        self.attn_num_head_channels = attn_num_head_channels
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlockFlat(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+
+        for _ in range(num_layers):
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        attn_num_head_channels,
+                        in_channels // attn_num_head_channels,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        attn_num_head_channels,
+                        in_channels // attn_num_head_channels,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            resnets.append(
+                ResnetBlockFlat(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+    def set_attention_slice(self, slice_size):
+        if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+            raise ValueError(
+                f"Make sure slice_size {slice_size} is a divisor of "
+                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+            )
+        if slice_size is not None and slice_size > self.attn_num_head_channels:
+            raise ValueError(
+                f"Chunk_size {slice_size} has to be smaller or equal to "
+                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+            )
+
+        for attn in self.attentions:
+            attn._set_attention_slice(slice_size)
+
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for attn in self.attentions:
+            attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states, encoder_hidden_states).sample
+            hidden_states = resnet(hidden_states, temb)
+
+        return hidden_states
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
new file mode 100644
index 000000000000..1280419c343d
--- /dev/null
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
@@ -0,0 +1,462 @@
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+
+import PIL.Image
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModel
+
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import logging
+from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline
+from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline
+from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class VersatileDiffusionPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-to-image generation using Stable Diffusion.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionMegaSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+
+    tokenizer: CLIPTokenizer
+    image_feature_extractor: CLIPFeatureExtractor
+    text_encoder: CLIPTextModel
+    image_encoder: CLIPVisionModel
+    image_unet: UNet2DConditionModel
+    text_unet: UNet2DConditionModel
+    vae: AutoencoderKL
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        image_feature_extractor: CLIPFeatureExtractor,
+        text_encoder: CLIPTextModel,
+        image_encoder: CLIPVisionModel,
+        image_unet: UNet2DConditionModel,
+        text_unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            image_feature_extractor=image_feature_extractor,
+            text_encoder=text_encoder,
+            image_encoder=image_encoder,
+            image_unet=image_unet,
+            text_unet=text_unet,
+            vae=vae,
+            scheduler=scheduler,
+        )
+
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.image_unet.config.attention_head_dim // 2
+        self.image_unet.set_attention_slice(slice_size)
+        self.text_unet.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    @torch.no_grad()
+    def image_variation(
+        self,
+        image: Union[torch.FloatTensor, PIL.Image.Image],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`):
+                The image prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionPipeline
+        >>> import torch
+        >>> import requests
+        >>> from io import BytesIO
+        >>> from PIL import Image
+
+        >>> # let's download an initial image
+        >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg"
+
+        >>> response = requests.get(url)
+        >>> image = Image.open(BytesIO(response.content)).convert("RGB")
+
+        >>> pipe = VersatileDiffusionPipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> image = pipe(image, generator=generator).images[0]
+        >>> image.save("./car_variation.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        expected_components = inspect.signature(VersatileDiffusionImageVariationPipeline.__init__).parameters.keys()
+        components = {name: component for name, component in self.components.items() if name in expected_components}
+        return VersatileDiffusionImageVariationPipeline(**components)(
+            image=image,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
+
+    @torch.no_grad()
+    def text_to_image(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionPipeline
+        >>> import torch
+
+        >>> pipe = VersatileDiffusionPipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> image = pipe.text_to_image("an astronaut riding on a horse on mars", generator=generator).images[0]
+        >>> image.save("./astronaut.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        expected_components = inspect.signature(VersatileDiffusionTextToImagePipeline.__init__).parameters.keys()
+        components = {name: component for name, component in self.components.items() if name in expected_components}
+        temp_pipeline = VersatileDiffusionTextToImagePipeline(**components)
+        output = temp_pipeline(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            negative_prompt=negative_prompt,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
+        # swap the attention blocks back to the original state
+        temp_pipeline._swap_unet_attention_blocks()
+
+        return output
+
+    @torch.no_grad()
+    def dual_guided(
+        self,
+        prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
+        image: Union[str, List[str]],
+        text_to_image_strength: float = 0.5,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionPipeline
+        >>> import torch
+        >>> import requests
+        >>> from io import BytesIO
+        >>> from PIL import Image
+
+        >>> # let's download an initial image
+        >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg"
+
+        >>> response = requests.get(url)
+        >>> image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> text = "a red car in the sun"
+
+        >>> pipe = VersatileDiffusionPipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> text_to_image_strength = 0.75
+
+        >>> image = pipe.dual_guided(
+        ...     prompt=text, image=image, text_to_image_strength=text_to_image_strength, generator=generator
+        ... ).images[0]
+        >>> image.save("./car_variation.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.ImagePipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images.
+        """
+
+        expected_components = inspect.signature(VersatileDiffusionDualGuidedPipeline.__init__).parameters.keys()
+        components = {name: component for name, component in self.components.items() if name in expected_components}
+        temp_pipeline = VersatileDiffusionDualGuidedPipeline(**components)
+        output = temp_pipeline(
+            prompt=prompt,
+            image=image,
+            text_to_image_strength=text_to_image_strength,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            num_images_per_prompt=num_images_per_prompt,
+            eta=eta,
+            generator=generator,
+            latents=latents,
+            output_type=output_type,
+            return_dict=return_dict,
+            callback=callback,
+            callback_steps=callback_steps,
+        )
+        temp_pipeline._revert_dual_attention()
+
+        return output
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
new file mode 100644
index 000000000000..ad4e8b0d0aa2
--- /dev/null
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
@@ -0,0 +1,628 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+
+import PIL
+from transformers import (
+    CLIPFeatureExtractor,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionModelWithProjection,
+)
+
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention import DualTransformer2DModel, Transformer2DModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import is_accelerate_available, logging
+from .modeling_text_unet import UNetFlatConditionModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline):
+    r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        vqvae ([`VQModel`]):
+            Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
+        bert ([`LDMBertModel`]):
+            Text-encoder model based on [BERT](https://huggingface.co/docs/transformers/model_doc/bert) architecture.
+        tokenizer (`transformers.BertTokenizer`):
+            Tokenizer of class
+            [BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+    tokenizer: CLIPTokenizer
+    image_feature_extractor: CLIPFeatureExtractor
+    text_encoder: CLIPTextModelWithProjection
+    image_encoder: CLIPVisionModelWithProjection
+    image_unet: UNet2DConditionModel
+    text_unet: UNetFlatConditionModel
+    vae: AutoencoderKL
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        image_feature_extractor: CLIPFeatureExtractor,
+        text_encoder: CLIPTextModelWithProjection,
+        image_encoder: CLIPVisionModelWithProjection,
+        image_unet: UNet2DConditionModel,
+        text_unet: UNetFlatConditionModel,
+        vae: AutoencoderKL,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+    ):
+        super().__init__()
+        self.register_modules(
+            tokenizer=tokenizer,
+            image_feature_extractor=image_feature_extractor,
+            text_encoder=text_encoder,
+            image_encoder=image_encoder,
+            image_unet=image_unet,
+            text_unet=text_unet,
+            vae=vae,
+            scheduler=scheduler,
+        )
+
+        if self.text_unet is not None and (
+            "dual_cross_attention" not in self.image_unet.config or not self.image_unet.config.dual_cross_attention
+        ):
+            # if loading from a universal checkpoint rather than a saved dual-guided pipeline
+            self._convert_to_dual_attention()
+
+    def remove_unused_weights(self):
+        self.register_modules(text_unet=None)
+
+    def _convert_to_dual_attention(self):
+        """
+        Replace image_unet's `Transformer2DModel` blocks with `DualTransformer2DModel` that contains transformer blocks
+        from both `image_unet` and `text_unet`
+        """
+        for name, module in self.image_unet.named_modules():
+            if isinstance(module, Transformer2DModel):
+                parent_name, index = name.rsplit(".", 1)
+                index = int(index)
+
+                image_transformer = self.image_unet.get_submodule(parent_name)[index]
+                text_transformer = self.text_unet.get_submodule(parent_name)[index]
+
+                config = image_transformer.config
+                dual_transformer = DualTransformer2DModel(
+                    num_attention_heads=config.num_attention_heads,
+                    attention_head_dim=config.attention_head_dim,
+                    in_channels=config.in_channels,
+                    num_layers=config.num_layers,
+                    dropout=config.dropout,
+                    norm_num_groups=config.norm_num_groups,
+                    cross_attention_dim=config.cross_attention_dim,
+                    attention_bias=config.attention_bias,
+                    sample_size=config.sample_size,
+                    num_vector_embeds=config.num_vector_embeds,
+                    activation_fn=config.activation_fn,
+                    num_embeds_ada_norm=config.num_embeds_ada_norm,
+                )
+                dual_transformer.transformers[0] = image_transformer
+                dual_transformer.transformers[1] = text_transformer
+
+                self.image_unet.get_submodule(parent_name)[index] = dual_transformer
+                self.image_unet.register_to_config(dual_cross_attention=True)
+
+    def _revert_dual_attention(self):
+        """
+        Revert the image_unet `DualTransformer2DModel` blocks back to `Transformer2DModel` with image_unet weights Call
+        this function if you reuse `image_unet` in another pipeline, e.g. `VersatileDiffusionPipeline`
+        """
+        for name, module in self.image_unet.named_modules():
+            if isinstance(module, DualTransformer2DModel):
+                parent_name, index = name.rsplit(".", 1)
+                index = int(index)
+                self.image_unet.get_submodule(parent_name)[index] = module.transformers[0]
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention with unet->image_unet
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.image_unet.set_use_memory_efficient_attention_xformers(True)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention with unet->image_unet
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.image_unet.set_use_memory_efficient_attention_xformers(False)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing with unet->image_unet
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.image_unet.config.attention_head_dim // 2
+        self.image_unet.set_attention_slice(slice_size)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        for cpu_offloaded_model in [self.image_unet, self.text_unet, self.text_encoder, self.vae]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device with unet->image_unet
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.image_unet, "_hf_hook"):
+            return self.device
+        for module in self.image_unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def _encode_text_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+        """
+
+        def normalize_embeddings(encoder_output):
+            embeds = self.text_encoder.text_projection(encoder_output.last_hidden_state)
+            embeds_pooled = encoder_output.text_embeds
+            embeds = embeds / torch.norm(embeds_pooled.unsqueeze(1), dim=-1, keepdim=True)
+            return embeds
+
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = normalize_embeddings(text_embeddings)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens = [""] * batch_size
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = normalize_embeddings(uncond_embeddings)
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    def _encode_image_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+        """
+
+        def normalize_embeddings(encoder_output):
+            embeds = self.image_encoder.vision_model.post_layernorm(encoder_output.last_hidden_state)
+            embeds = self.image_encoder.visual_projection(embeds)
+            embeds_pooled = embeds[:, 0:1]
+            embeds = embeds / torch.norm(embeds_pooled, dim=-1, keepdim=True)
+            return embeds
+
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        image_input = self.image_feature_extractor(images=prompt, return_tensors="pt")
+        pixel_values = image_input.pixel_values.to(device).to(self.image_encoder.dtype)
+        image_embeddings = self.image_encoder(pixel_values)
+        image_embeddings = normalize_embeddings(image_embeddings)
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = image_embeddings.shape
+        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
+        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size
+            uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pt")
+            pixel_values = uncond_images.pixel_values.to(device).to(self.image_encoder.dtype)
+            uncond_embeddings = self.image_encoder(pixel_values)
+            uncond_embeddings = normalize_embeddings(uncond_embeddings)
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and conditional embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeddings = torch.cat([uncond_embeddings, image_embeddings])
+
+        return image_embeddings
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, prompt, image, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, PIL.Image.Image) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` `PIL.Image` or `list` but is {type(prompt)}")
+        if not isinstance(image, str) and not isinstance(image, PIL.Image.Image) and not isinstance(image, list):
+            raise ValueError(f"`image` has to be of type `str` `PIL.Image` or `list` but is {type(image)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def set_transformer_params(self, mix_ratio: float = 0.5, condition_types: Tuple = ("text", "image")):
+        for name, module in self.image_unet.named_modules():
+            if isinstance(module, DualTransformer2DModel):
+                module.mix_ratio = mix_ratio
+
+                for i, type in enumerate(condition_types):
+                    if type == "text":
+                        module.condition_lengths[i] = self.text_encoder.config.max_position_embeddings
+                        module.transformer_index_for_condition[i] = 1  # use the second (text) transformer
+                    else:
+                        module.condition_lengths[i] = 257
+                        module.transformer_index_for_condition[i] = 0  # use the first (image) transformer
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
+        image: Union[str, List[str]],
+        text_to_image_strength: float = 0.5,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionDualGuidedPipeline
+        >>> import torch
+        >>> import requests
+        >>> from io import BytesIO
+        >>> from PIL import Image
+
+        >>> # let's download an initial image
+        >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg"
+
+        >>> response = requests.get(url)
+        >>> image = Image.open(BytesIO(response.content)).convert("RGB")
+        >>> text = "a red car in the sun"
+
+        >>> pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe.remove_unused_weights()
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> text_to_image_strength = 0.75
+
+        >>> image = pipe(
+        ...     prompt=text, image=image, text_to_image_strength=text_to_image_strength, generator=generator
+        ... ).images[0]
+        >>> image.save("./car_variation.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.ImagePipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
+            returning a tuple, the first element is a list with the generated images.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, image, height, width, callback_steps)
+
+        # 2. Define call parameters
+        prompt = [prompt] if not isinstance(prompt, list) else prompt
+        image = [image] if not isinstance(image, list) else image
+        batch_size = len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompts
+        text_embeddings = self._encode_text_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance)
+        image_embeddings = self._encode_image_prompt(image, device, num_images_per_prompt, do_classifier_free_guidance)
+        dual_prompt_embeddings = torch.cat([text_embeddings, image_embeddings], dim=1)
+        prompt_types = ("text", "image")
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.image_unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            dual_prompt_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Combine the attention blocks of the image and text UNets
+        self.set_transformer_params(text_to_image_strength, prompt_types)
+
+        # 8. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=dual_prompt_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        # 9. Post-processing
+        image = self.decode_latents(latents)
+
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
new file mode 100644
index 000000000000..652b7b735af4
--- /dev/null
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
@@ -0,0 +1,462 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+import torch.utils.checkpoint
+
+import PIL
+from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection
+
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import is_accelerate_available, logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
+    r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        vqvae ([`VQModel`]):
+            Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
+        bert ([`LDMBertModel`]):
+            Text-encoder model based on [BERT](https://huggingface.co/docs/transformers/model_doc/bert) architecture.
+        tokenizer (`transformers.BertTokenizer`):
+            Tokenizer of class
+            [BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+    image_feature_extractor: CLIPFeatureExtractor
+    image_encoder: CLIPVisionModelWithProjection
+    image_unet: UNet2DConditionModel
+    vae: AutoencoderKL
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+
+    def __init__(
+        self,
+        image_feature_extractor: CLIPFeatureExtractor,
+        image_encoder: CLIPVisionModelWithProjection,
+        image_unet: UNet2DConditionModel,
+        vae: AutoencoderKL,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+    ):
+        super().__init__()
+        self.register_modules(
+            image_feature_extractor=image_feature_extractor,
+            image_encoder=image_encoder,
+            image_unet=image_unet,
+            vae=vae,
+            scheduler=scheduler,
+        )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention with unet->image_unet
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.image_unet.set_use_memory_efficient_attention_xformers(True)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention with unet->image_unet
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.image_unet.set_use_memory_efficient_attention_xformers(False)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing with unet->image_unet
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.image_unet.config.attention_head_dim // 2
+        self.image_unet.set_attention_slice(slice_size)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        for cpu_offloaded_model in [self.image_unet, self.text_unet, self.text_encoder, self.vae]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device with unet->image_unet
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.image_unet, "_hf_hook"):
+            return self.device
+        for module in self.image_unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+
+        def normalize_embeddings(encoder_output):
+            embeds = self.image_encoder.vision_model.post_layernorm(encoder_output.last_hidden_state)
+            embeds = self.image_encoder.visual_projection(embeds)
+            embeds_pooled = embeds[:, 0:1]
+            embeds = embeds / torch.norm(embeds_pooled, dim=-1, keepdim=True)
+            return embeds
+
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        # get prompt text embeddings
+        image_input = self.image_feature_extractor(images=prompt, return_tensors="pt")
+        pixel_values = image_input.pixel_values.to(device).to(self.image_encoder.dtype)
+        image_embeddings = self.image_encoder(pixel_values)
+        image_embeddings = normalize_embeddings(image_embeddings)
+
+        # duplicate image embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = image_embeddings.shape
+        image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1)
+        image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_images: List[str]
+            if negative_prompt is None:
+                uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, PIL.Image.Image):
+                uncond_images = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_images = negative_prompt
+
+            uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pt")
+            pixel_values = uncond_images.pixel_values.to(device).to(self.image_encoder.dtype)
+            uncond_embeddings = self.image_encoder(pixel_values)
+            uncond_embeddings = normalize_embeddings(uncond_embeddings)
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and conditional embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeddings = torch.cat([uncond_embeddings, image_embeddings])
+
+        return image_embeddings
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    def check_inputs(self, image, height, width, callback_steps):
+        if not isinstance(image, PIL.Image.Image) and not isinstance(image, torch.Tensor):
+            raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`):
+                The image prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionImageVariationPipeline
+        >>> import torch
+        >>> import requests
+        >>> from io import BytesIO
+        >>> from PIL import Image
+
+        >>> # let's download an initial image
+        >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg"
+
+        >>> response = requests.get(url)
+        >>> image = Image.open(BytesIO(response.content)).convert("RGB")
+
+        >>> pipe = VersatileDiffusionImageVariationPipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> image = pipe(image, generator=generator).images[0]
+        >>> image.save("./car_variation.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(image, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(image, PIL.Image.Image) else len(image)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        image_embeddings = self._encode_prompt(
+            image, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.image_unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            image_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        # 8. Post-processing
+        image = self.decode_latents(latents)
+
+        # 9. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
new file mode 100644
index 000000000000..d07d734a642f
--- /dev/null
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
@@ -0,0 +1,514 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import torch
+import torch.utils.checkpoint
+
+from transformers import CLIPFeatureExtractor, CLIPTextModelWithProjection, CLIPTokenizer
+
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...models.attention import Transformer2DModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import is_accelerate_available, logging
+from .modeling_text_unet import UNetFlatConditionModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
+    r"""
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Parameters:
+        vqvae ([`VQModel`]):
+            Vector-quantized (VQ) Model to encode and decode images to and from latent representations.
+        bert ([`LDMBertModel`]):
+            Text-encoder model based on [BERT](https://huggingface.co/docs/transformers/model_doc/bert) architecture.
+        tokenizer (`transformers.BertTokenizer`):
+            Tokenizer of class
+            [BertTokenizer](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+    tokenizer: CLIPTokenizer
+    image_feature_extractor: CLIPFeatureExtractor
+    text_encoder: CLIPTextModelWithProjection
+    image_unet: UNet2DConditionModel
+    text_unet: UNetFlatConditionModel
+    vae: AutoencoderKL
+    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        text_encoder: CLIPTextModelWithProjection,
+        image_unet: UNet2DConditionModel,
+        text_unet: UNetFlatConditionModel,
+        vae: AutoencoderKL,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+    ):
+        super().__init__()
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            image_unet=image_unet,
+            text_unet=text_unet,
+            vae=vae,
+            scheduler=scheduler,
+        )
+
+        if self.text_unet is not None:
+            self._swap_unet_attention_blocks()
+
+    def _swap_unet_attention_blocks(self):
+        """
+        Swap the `Transformer2DModel` blocks between the image and text UNets
+        """
+        for name, module in self.image_unet.named_modules():
+            if isinstance(module, Transformer2DModel):
+                parent_name, index = name.rsplit(".", 1)
+                index = int(index)
+                self.image_unet.get_submodule(parent_name)[index], self.text_unet.get_submodule(parent_name)[index] = (
+                    self.text_unet.get_submodule(parent_name)[index],
+                    self.image_unet.get_submodule(parent_name)[index],
+                )
+
+    def remove_unused_weights(self):
+        self.register_modules(text_unet=None)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention with unet->image_unet
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.image_unet.set_use_memory_efficient_attention_xformers(True)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention with unet->image_unet
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.image_unet.set_use_memory_efficient_attention_xformers(False)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing with unet->image_unet
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = self.image_unet.config.attention_head_dim // 2
+        self.image_unet.set_attention_slice(slice_size)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        for cpu_offloaded_model in [self.image_unet, self.text_unet, self.text_encoder, self.vae]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device with unet->image_unet
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.image_unet, "_hf_hook"):
+            return self.device
+        for module in self.image_unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+
+        def normalize_embeddings(encoder_output):
+            embeds = self.text_encoder.text_projection(encoder_output.last_hidden_state)
+            embeds_pooled = encoder_output.text_embeds
+            embeds = embeds / torch.norm(embeds_pooled.unsqueeze(1), dim=-1, keepdim=True)
+            return embeds
+
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = normalize_embeddings(text_embeddings)
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = normalize_embeddings(uncond_embeddings)
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
+    def decode_latents(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import VersatileDiffusionTextToImagePipeline
+        >>> import torch
+
+        >>> pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(
+        ...     "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+        ... )
+        >>> pipe.remove_unused_weights()
+        >>> pipe = pipe.to("cuda")
+
+        >>> generator = torch.Generator(device="cuda").manual_seed(0)
+        >>> image = pipe("an astronaut riding on a horse on mars", generator=generator).images[0]
+        >>> image.save("./astronaut.png")
+        ```
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        # 5. Prepare latent variables
+        num_channels_latents = self.image_unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 6. Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 7. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            # predict the noise residual
+            noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        # 9. Post-processing
+        image = self.decode_latents(latents)
+
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index c184b6295de7..d255c174c743 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -154,6 +154,66 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class VersatileDiffusionDualGuidedPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class VersatileDiffusionImageVariationPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class VersatileDiffusionPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class VersatileDiffusionTextToImagePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class VQDiffusionPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
diff --git a/tests/pipelines/versatile_diffusion/__init__.py b/tests/pipelines/versatile_diffusion/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py
new file mode 100644
index 000000000000..9fb6ca522f5d
--- /dev/null
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py
@@ -0,0 +1,112 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import VersatileDiffusionDualGuidedPipeline
+from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class VersatileDiffusionDualGuidedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pass
+
+
+@slow
+@require_torch_gpu
+class VersatileDiffusionDualGuidedPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_remove_unused_weights_save_load(self):
+        pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion")
+        # remove text_unet
+        pipe.remove_unused_weights()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        second_prompt = load_image(
+            "https://raw.githubusercontent.com/SHI-Labs/Versatile-Diffusion/master/assets/benz.jpg"
+        )
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe(
+            prompt="first prompt",
+            image=second_prompt,
+            text_to_image_strength=0.75,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="numpy",
+        ).images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(tmpdirname)
+
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = generator.manual_seed(0)
+        new_image = pipe(
+            prompt="first prompt",
+            image=second_prompt,
+            text_to_image_strength=0.75,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="numpy",
+        ).images
+
+        assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass"
+
+    def test_inference_dual_guided(self):
+        pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion")
+        pipe.remove_unused_weights()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        first_prompt = "cyberpunk 2077"
+        second_prompt = load_image(
+            "https://raw.githubusercontent.com/SHI-Labs/Versatile-Diffusion/master/assets/benz.jpg"
+        )
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe(
+            prompt=first_prompt,
+            image=second_prompt,
+            text_to_image_strength=0.75,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=50,
+            output_type="numpy",
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.014, 0.0112, 0.0136, 0.0145, 0.0107, 0.0113, 0.0272, 0.0215, 0.0216])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
new file mode 100644
index 000000000000..4eddc271db52
--- /dev/null
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
@@ -0,0 +1,58 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import VersatileDiffusionImageVariationPipeline
+from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class VersatileDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pass
+
+
+@slow
+@require_torch_gpu
+class VersatileDiffusionImageVariationPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_image_variations(self):
+        pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion")
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        image_prompt = load_image(
+            "https://raw.githubusercontent.com/SHI-Labs/Versatile-Diffusion/master/assets/benz.jpg"
+        )
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe(
+            image=image_prompt,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=50,
+            output_type="numpy",
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0113, 0.2241, 0.4024, 0.0839, 0.0871, 0.2725, 0.2581, 0.0, 0.1096])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
new file mode 100644
index 000000000000..1209abf6a8db
--- /dev/null
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
@@ -0,0 +1,129 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import VersatileDiffusionPipeline
+from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class VersatileDiffusionMegaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pass
+
+
+@slow
+@require_torch_gpu
+class VersatileDiffusionMegaPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_from_pretrained_save_pretrained(self):
+        pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt_image = load_image(
+            "https://raw.githubusercontent.com/SHI-Labs/Versatile-Diffusion/master/assets/benz.jpg"
+        )
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe.dual_guided(
+            prompt="first prompt",
+            image=prompt_image,
+            text_to_image_strength=0.75,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="numpy",
+        ).images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = VersatileDiffusionPipeline.from_pretrained(tmpdirname, torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = generator.manual_seed(0)
+        new_image = pipe.dual_guided(
+            prompt="first prompt",
+            image=prompt_image,
+            text_to_image_strength=0.75,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="numpy",
+        ).images
+
+        assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass"
+
+    def test_inference_dual_guided_then_text_to_image(self):
+        pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "cyberpunk 2077"
+        init_image = load_image(
+            "https://raw.githubusercontent.com/SHI-Labs/Versatile-Diffusion/master/assets/benz.jpg"
+        )
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe.dual_guided(
+            prompt=prompt,
+            image=init_image,
+            text_to_image_strength=0.75,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=50,
+            output_type="numpy",
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.014, 0.0112, 0.0136, 0.0145, 0.0107, 0.0113, 0.0272, 0.0215, 0.0216])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        prompt = "A painting of a squirrel eating a burger "
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe.text_to_image(
+            prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy"
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0408, 0.0181, 0.0, 0.0388, 0.0046, 0.0461, 0.0411, 0.0, 0.0222])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16)
+        image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images[0]
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0657, 0.0529, 0.0455, 0.0802, 0.0570, 0.0179, 0.0267, 0.0483, 0.0769])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py
new file mode 100644
index 000000000000..027819efee9f
--- /dev/null
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py
@@ -0,0 +1,86 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import VersatileDiffusionTextToImagePipeline
+from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class VersatileDiffusionTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pass
+
+
+@slow
+@require_torch_gpu
+class VersatileDiffusionTextToImagePipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_remove_unused_weights_save_load(self):
+        pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion")
+        # remove text_unet
+        pipe.remove_unused_weights()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger "
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe(
+            prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy"
+        ).images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(tmpdirname)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = generator.manual_seed(0)
+        new_image = pipe(
+            prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy"
+        ).images
+
+        assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass"
+
+    def test_inference_text2img(self):
+        pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion")
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger "
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe(
+            prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy"
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0408, 0.0181, 0.0, 0.0388, 0.0046, 0.0461, 0.0411, 0.0, 0.0222])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

From 16a32c9dab03d41204120b63cdae71c40b279bdf Mon Sep 17 00:00:00 2001
From: anton-l <anton@huggingface.co>
Date: Wed, 23 Nov 2022 19:12:31 +0100
Subject: [PATCH 236/269] Release: v0.8.0

---
 setup.py                  | 2 +-
 src/diffusers/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index d0aff10da63e..a4c336669e45 100644
--- a/setup.py
+++ b/setup.py
@@ -212,7 +212,7 @@ def run(self):
 
 setup(
     name="diffusers",
-    version="0.8.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.8.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="Diffusers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index c4052be34e1e..a1faa28000af 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -9,7 +9,7 @@
 )
 
 
-__version__ = "0.8.0.dev0"
+__version__ = "0.8.0"
 
 from .configuration_utils import ConfigMixin
 from .onnx_utils import OnnxRuntimeModel

From f07a16e09bb5b1cf4fa2306bfa4ea791f24fa968 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Wed, 23 Nov 2022 20:46:30 +0100
Subject: [PATCH 237/269] update unet2d (#1376)

* boom boom

* remove duplicate arg

* add use_linear_proj arg

* fix copies

* style

* add fast tests

* use_linear_proj -> use_linear_projection
---
 src/diffusers/models/attention.py             | 35 +++++++++++++----
 src/diffusers/models/unet_2d_blocks.py        | 10 +++++
 src/diffusers/models/unet_2d_condition.py     | 21 ++++++----
 .../versatile_diffusion/modeling_text_unet.py | 27 +++++++++----
 tests/models/test_models_unet_2d.py           | 38 +++++++++++++++++++
 5 files changed, 110 insertions(+), 21 deletions(-)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 6b2bd5205bdb..92d84acbbef7 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -99,8 +99,10 @@ def __init__(
         num_vector_embeds: Optional[int] = None,
         activation_fn: str = "geglu",
         num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
     ):
         super().__init__()
+        self.use_linear_projection = use_linear_projection
         self.num_attention_heads = num_attention_heads
         self.attention_head_dim = attention_head_dim
         inner_dim = num_attention_heads * attention_head_dim
@@ -126,7 +128,10 @@ def __init__(
             self.in_channels = in_channels
 
             self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
-            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+            if use_linear_projection:
+                self.proj_in = nn.Linear(in_channels, inner_dim)
+            else:
+                self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
         elif self.is_input_vectorized:
             assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
             assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
@@ -159,7 +164,10 @@ def __init__(
 
         # 4. Define output layers
         if self.is_input_continuous:
-            self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+            if use_linear_projection:
+                self.proj_out = nn.Linear(in_channels, inner_dim)
+            else:
+                self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
         elif self.is_input_vectorized:
             self.norm_out = nn.LayerNorm(inner_dim)
             self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
@@ -191,10 +199,18 @@ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, retu
         if self.is_input_continuous:
             batch, channel, height, weight = hidden_states.shape
             residual = hidden_states
+
             hidden_states = self.norm(hidden_states)
-            hidden_states = self.proj_in(hidden_states)
-            inner_dim = hidden_states.shape[1]
-            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
+
+            if not self.use_linear_projection:
+                hidden_states = self.proj_in(hidden_states)
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
+            else:
+                hidden_states = self.norm(hidden_states)
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
+                hidden_states = self.proj_in(hidden_states)
         elif self.is_input_vectorized:
             hidden_states = self.latent_image_embedding(hidden_states)
 
@@ -204,8 +220,13 @@ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, retu
 
         # 3. Output
         if self.is_input_continuous:
-            hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2)
-            hidden_states = self.proj_out(hidden_states)
+            if not self.use_linear_projection:
+                hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2)
+                hidden_states = self.proj_out(hidden_states)
+            else:
+                hidden_states = self.proj_out(hidden_states)
+                hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2)
+
             output = hidden_states + residual
         elif self.is_input_vectorized:
             hidden_states = self.norm_out(hidden_states)
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 4dd15845e04f..5a8a97187f11 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -33,6 +33,7 @@ def get_down_block(
     cross_attention_dim=None,
     downsample_padding=None,
     dual_cross_attention=False,
+    use_linear_projection=False,
 ):
     down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
     if down_block_type == "DownBlock2D":
@@ -76,6 +77,7 @@ def get_down_block(
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attn_num_head_channels,
             dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
         )
     elif down_block_type == "SkipDownBlock2D":
         return SkipDownBlock2D(
@@ -140,6 +142,7 @@ def get_up_block(
     resnet_groups=None,
     cross_attention_dim=None,
     dual_cross_attention=False,
+    use_linear_projection=False,
 ):
     up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
     if up_block_type == "UpBlock2D":
@@ -170,6 +173,7 @@ def get_up_block(
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attn_num_head_channels,
             dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
         )
     elif up_block_type == "AttnUpBlock2D":
         return AttnUpBlock2D(
@@ -327,6 +331,7 @@ def __init__(
         output_scale_factor=1.0,
         cross_attention_dim=1280,
         dual_cross_attention=False,
+        use_linear_projection=False,
         **kwargs,
     ):
         super().__init__()
@@ -362,6 +367,7 @@ def __init__(
                         num_layers=1,
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
                     )
                 )
             else:
@@ -523,6 +529,7 @@ def __init__(
         downsample_padding=1,
         add_downsample=True,
         dual_cross_attention=False,
+        use_linear_projection=False,
     ):
         super().__init__()
         resnets = []
@@ -556,6 +563,7 @@ def __init__(
                         num_layers=1,
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
                     )
                 )
             else:
@@ -1120,6 +1128,7 @@ def __init__(
         output_scale_factor=1.0,
         add_upsample=True,
         dual_cross_attention=False,
+        use_linear_projection=False,
     ):
         super().__init__()
         resnets = []
@@ -1155,6 +1164,7 @@ def __init__(
                         num_layers=1,
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
                     )
                 )
             else:
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 4eaed803ceee..206097149331 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -61,7 +61,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin):
         in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
         out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
         center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
-        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
             Whether to flip the sin to cos in the time embedding.
         freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
         down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
@@ -106,8 +106,9 @@ def __init__(
         norm_num_groups: int = 32,
         norm_eps: float = 1e-5,
         cross_attention_dim: int = 1280,
-        attention_head_dim: int = 8,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
         dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
     ):
         super().__init__()
 
@@ -127,6 +128,9 @@ def __init__(
         self.mid_block = None
         self.up_blocks = nn.ModuleList([])
 
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
         # down
         output_channel = block_out_channels[0]
         for i, down_block_type in enumerate(down_block_types):
@@ -145,9 +149,10 @@ def __init__(
                 resnet_act_fn=act_fn,
                 resnet_groups=norm_num_groups,
                 cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=attention_head_dim,
+                attn_num_head_channels=attention_head_dim[i],
                 downsample_padding=downsample_padding,
                 dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
             )
             self.down_blocks.append(down_block)
 
@@ -160,9 +165,10 @@ def __init__(
             output_scale_factor=mid_block_scale_factor,
             resnet_time_scale_shift="default",
             cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attention_head_dim,
+            attn_num_head_channels=attention_head_dim[-1],
             resnet_groups=norm_num_groups,
             dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
         )
 
         # count how many layers upsample the images
@@ -170,6 +176,7 @@ def __init__(
 
         # up
         reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
         output_channel = reversed_block_out_channels[0]
         for i, up_block_type in enumerate(up_block_types):
             is_final_block = i == len(block_out_channels) - 1
@@ -197,8 +204,9 @@ def __init__(
                 resnet_act_fn=act_fn,
                 resnet_groups=norm_num_groups,
                 cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=attention_head_dim,
+                attn_num_head_channels=reversed_attention_head_dim[i],
                 dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
@@ -256,8 +264,7 @@ def forward(
         Args:
             sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
             timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
-            encoder_hidden_states (`torch.FloatTensor`):
-                (batch_size, sequence_length, hidden_size) encoder hidden states
+            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
 
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index c89080a59e9d..6d521228e31b 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -124,7 +124,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
         in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
         out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
         center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
-        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
             Whether to flip the sin to cos in the time embedding.
         freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
         down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "DownBlockFlat")`):
@@ -174,8 +174,9 @@ def __init__(
         norm_num_groups: int = 32,
         norm_eps: float = 1e-5,
         cross_attention_dim: int = 1280,
-        attention_head_dim: int = 8,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
         dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
     ):
         super().__init__()
 
@@ -195,6 +196,9 @@ def __init__(
         self.mid_block = None
         self.up_blocks = nn.ModuleList([])
 
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
         # down
         output_channel = block_out_channels[0]
         for i, down_block_type in enumerate(down_block_types):
@@ -213,9 +217,10 @@ def __init__(
                 resnet_act_fn=act_fn,
                 resnet_groups=norm_num_groups,
                 cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=attention_head_dim,
+                attn_num_head_channels=attention_head_dim[i],
                 downsample_padding=downsample_padding,
                 dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
             )
             self.down_blocks.append(down_block)
 
@@ -228,9 +233,10 @@ def __init__(
             output_scale_factor=mid_block_scale_factor,
             resnet_time_scale_shift="default",
             cross_attention_dim=cross_attention_dim,
-            attn_num_head_channels=attention_head_dim,
+            attn_num_head_channels=attention_head_dim[-1],
             resnet_groups=norm_num_groups,
             dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
         )
 
         # count how many layers upsample the images
@@ -238,6 +244,7 @@ def __init__(
 
         # up
         reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
         output_channel = reversed_block_out_channels[0]
         for i, up_block_type in enumerate(up_block_types):
             is_final_block = i == len(block_out_channels) - 1
@@ -265,8 +272,9 @@ def __init__(
                 resnet_act_fn=act_fn,
                 resnet_groups=norm_num_groups,
                 cross_attention_dim=cross_attention_dim,
-                attn_num_head_channels=attention_head_dim,
+                attn_num_head_channels=reversed_attention_head_dim[i],
                 dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
@@ -324,8 +332,7 @@ def forward(
         Args:
             sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
             timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
-            encoder_hidden_states (`torch.FloatTensor`):
-                (batch_size, sequence_length, hidden_size) encoder hidden states
+            encoder_hidden_states (`torch.FloatTensor`): (batch, channel, height, width) encoder hidden states
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
 
@@ -640,6 +647,7 @@ def __init__(
         downsample_padding=1,
         add_downsample=True,
         dual_cross_attention=False,
+        use_linear_projection=False,
     ):
         super().__init__()
         resnets = []
@@ -673,6 +681,7 @@ def __init__(
                         num_layers=1,
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
                     )
                 )
             else:
@@ -851,6 +860,7 @@ def __init__(
         output_scale_factor=1.0,
         add_upsample=True,
         dual_cross_attention=False,
+        use_linear_projection=False,
     ):
         super().__init__()
         resnets = []
@@ -886,6 +896,7 @@ def __init__(
                         num_layers=1,
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
                     )
                 )
             else:
@@ -988,6 +999,7 @@ def __init__(
         output_scale_factor=1.0,
         cross_attention_dim=1280,
         dual_cross_attention=False,
+        use_linear_projection=False,
         **kwargs,
     ):
         super().__init__()
@@ -1023,6 +1035,7 @@ def __init__(
                         num_layers=1,
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
                     )
                 )
             else:
diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py
index 81437311c658..02c6d314bfff 100644
--- a/tests/models/test_models_unet_2d.py
+++ b/tests/models/test_models_unet_2d.py
@@ -296,6 +296,44 @@ def test_gradient_checkpointing(self):
         for name, param in named_params.items():
             self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5))
 
+    def test_model_with_attention_head_dim_tuple(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["attention_head_dim"] = (8, 16)
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_model_with_use_linear_projection(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+        init_dict["use_linear_projection"] = True
+
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+        model.eval()
+
+        with torch.no_grad():
+            output = model(**inputs_dict)
+
+            if isinstance(output, dict):
+                output = output.sample
+
+        self.assertIsNotNone(output)
+        expected_shape = inputs_dict["sample"].shape
+        self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
 
 class NCSNppModelTests(ModelTesterMixin, unittest.TestCase):
     model_class = UNet2DModel

From 1524122532927dfd8ff80b0899344e696a7ab47a Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Thu, 24 Nov 2022 00:12:45 +0100
Subject: [PATCH 238/269] [Transformer2DModel] don't norm twice (#1381)

don't norm twice
---
 src/diffusers/models/attention.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 92d84acbbef7..0aacddf34d00 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -201,13 +201,11 @@ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, retu
             residual = hidden_states
 
             hidden_states = self.norm(hidden_states)
-
             if not self.use_linear_projection:
                 hidden_states = self.proj_in(hidden_states)
                 inner_dim = hidden_states.shape[1]
                 hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
             else:
-                hidden_states = self.norm(hidden_states)
                 inner_dim = hidden_states.shape[1]
                 hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
                 hidden_states = self.proj_in(hidden_states)

From 35d8186172e598f6382e2a896db5f9dfa9896ba8 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 24 Nov 2022 00:24:05 +0100
Subject: [PATCH 239/269] [Bad dependencies] Fix imports (#1382)

* fix imports

* better error

* up

* finish
---
 .../pipelines/stable_diffusion/__init__.py    | 15 +++++++++--
 .../pipelines/versatile_diffusion/__init__.py | 11 ++++++--
 src/diffusers/utils/__init__.py               |  1 +
 src/diffusers/utils/import_utils.py           | 25 +++++++++++++++++++
 4 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 0b2fa15d7686..b3f3a911b1b3 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -6,7 +6,14 @@
 import PIL
 from PIL import Image
 
-from ...utils import BaseOutput, is_flax_available, is_onnx_available, is_torch_available, is_transformers_available
+from ...utils import (
+    BaseOutput,
+    is_flax_available,
+    is_onnx_available,
+    is_torch_available,
+    is_transformers_available,
+    is_transformers_version,
+)
 
 
 @dataclass
@@ -30,12 +37,16 @@ class StableDiffusionPipelineOutput(BaseOutput):
 if is_transformers_available() and is_torch_available():
     from .pipeline_cycle_diffusion import CycleDiffusionPipeline
     from .pipeline_stable_diffusion import StableDiffusionPipeline
-    from .pipeline_stable_diffusion_image_variation import StableDiffusionImageVariationPipeline
     from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
     from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
     from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
     from .safety_checker import StableDiffusionSafetyChecker
 
+if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0"):
+    from .pipeline_stable_diffusion_image_variation import StableDiffusionImageVariationPipeline
+else:
+    from ...utils.dummy_torch_and_transformers_objects import StableDiffusionImageVariationPipeline
+
 if is_transformers_available() and is_onnx_available():
     from .pipeline_onnx_stable_diffusion import OnnxStableDiffusionPipeline, StableDiffusionOnnxPipeline
     from .pipeline_onnx_stable_diffusion_img2img import OnnxStableDiffusionImg2ImgPipeline
diff --git a/src/diffusers/pipelines/versatile_diffusion/__init__.py b/src/diffusers/pipelines/versatile_diffusion/__init__.py
index 65bc1b72007a..ebe4343d59a0 100644
--- a/src/diffusers/pipelines/versatile_diffusion/__init__.py
+++ b/src/diffusers/pipelines/versatile_diffusion/__init__.py
@@ -1,9 +1,16 @@
-from ...utils import is_torch_available, is_transformers_available
+from ...utils import is_torch_available, is_transformers_available, is_transformers_version
 
 
-if is_transformers_available() and is_torch_available():
+if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0"):
     from .modeling_text_unet import UNetFlatConditionModel
     from .pipeline_versatile_diffusion import VersatileDiffusionPipeline
     from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline
     from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline
     from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline
+else:
+    from ...utils.dummy_torch_and_transformers_objects import (
+        VersatileDiffusionDualGuidedPipeline,
+        VersatileDiffusionImageVariationPipeline,
+        VersatileDiffusionPipeline,
+        VersatileDiffusionTextToImagePipeline,
+    )
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index 909d878ed614..e86f3b801ae4 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -33,6 +33,7 @@
     is_torch_available,
     is_torch_version,
     is_transformers_available,
+    is_transformers_version,
     is_unidecode_available,
     requires_backends,
 )
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
index 005cbb6170f4..ddbd9350b60a 100644
--- a/src/diffusers/utils/import_utils.py
+++ b/src/diffusers/utils/import_utils.py
@@ -303,6 +303,17 @@ def requires_backends(obj, backends):
     if failed:
         raise ImportError("".join(failed))
 
+    if name in [
+        "VersatileDiffusionTextToImagePipeline",
+        "VersatileDiffusionPipeline",
+        "VersatileDiffusionDualGuidedPipeline",
+        "StableDiffusionImageVariationPipeline",
+    ] and is_transformers_version("<", "4.25.0"):
+        raise ImportError(
+            f"You need to install `transformers` from 'main' in order to use {name}: \n```\n pip install"
+            " git+https://github.com/huggingface/transformers \n```"
+        )
+
 
 class DummyObject(type):
     """
@@ -347,3 +358,17 @@ def is_torch_version(operation: str, version: str):
             A string version of PyTorch
     """
     return compare_versions(parse(_torch_version), operation, version)
+
+
+def is_transformers_version(operation: str, version: str):
+    """
+    Args:
+    Compares the current Transformers version to a given reference with an operation.
+        operation (`str`):
+            A string representation of an operator, such as `">"` or `"<="`
+        version (`str`):
+            A string version of PyTorch
+    """
+    if not _transformers_available:
+        return False
+    return compare_versions(parse(_transformers_version), operation, version)

From 9479052dded67788932ebce370e53d69412ea7d1 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 24 Nov 2022 00:33:32 +0100
Subject: [PATCH 240/269] fix trailing . dep object

---
 src/diffusers/pipelines/stable_diffusion/__init__.py    | 2 +-
 src/diffusers/pipelines/versatile_diffusion/__init__.py | 2 +-
 src/diffusers/utils/import_utils.py                     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index b3f3a911b1b3..91cdab0a162e 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -42,7 +42,7 @@ class StableDiffusionPipelineOutput(BaseOutput):
     from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
     from .safety_checker import StableDiffusionSafetyChecker
 
-if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0"):
+if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0."):
     from .pipeline_stable_diffusion_image_variation import StableDiffusionImageVariationPipeline
 else:
     from ...utils.dummy_torch_and_transformers_objects import StableDiffusionImageVariationPipeline
diff --git a/src/diffusers/pipelines/versatile_diffusion/__init__.py b/src/diffusers/pipelines/versatile_diffusion/__init__.py
index ebe4343d59a0..7865c6283494 100644
--- a/src/diffusers/pipelines/versatile_diffusion/__init__.py
+++ b/src/diffusers/pipelines/versatile_diffusion/__init__.py
@@ -1,7 +1,7 @@
 from ...utils import is_torch_available, is_transformers_available, is_transformers_version
 
 
-if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0"):
+if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0."):
     from .modeling_text_unet import UNetFlatConditionModel
     from .pipeline_versatile_diffusion import VersatileDiffusionPipeline
     from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
index ddbd9350b60a..ad1e8a900253 100644
--- a/src/diffusers/utils/import_utils.py
+++ b/src/diffusers/utils/import_utils.py
@@ -308,7 +308,7 @@ def requires_backends(obj, backends):
         "VersatileDiffusionPipeline",
         "VersatileDiffusionDualGuidedPipeline",
         "StableDiffusionImageVariationPipeline",
-    ] and is_transformers_version("<", "4.25.0"):
+    ] and is_transformers_version("<", "4.25.0."):
         raise ImportError(
             f"You need to install `transformers` from 'main' in order to use {name}: \n```\n pip install"
             " git+https://github.com/huggingface/transformers \n```"

From 9f476388fae3b4552cbd98b07331d34ee573dd35 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 24 Nov 2022 00:53:57 +0100
Subject: [PATCH 241/269] trailing . fix

---
 src/diffusers/pipelines/stable_diffusion/__init__.py    | 2 +-
 src/diffusers/pipelines/versatile_diffusion/__init__.py | 2 +-
 src/diffusers/utils/import_utils.py                     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 91cdab0a162e..3c012dbab89d 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -42,7 +42,7 @@ class StableDiffusionPipelineOutput(BaseOutput):
     from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
     from .safety_checker import StableDiffusionSafetyChecker
 
-if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0."):
+if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0.dev0"):
     from .pipeline_stable_diffusion_image_variation import StableDiffusionImageVariationPipeline
 else:
     from ...utils.dummy_torch_and_transformers_objects import StableDiffusionImageVariationPipeline
diff --git a/src/diffusers/pipelines/versatile_diffusion/__init__.py b/src/diffusers/pipelines/versatile_diffusion/__init__.py
index 7865c6283494..1d2caa7e2399 100644
--- a/src/diffusers/pipelines/versatile_diffusion/__init__.py
+++ b/src/diffusers/pipelines/versatile_diffusion/__init__.py
@@ -1,7 +1,7 @@
 from ...utils import is_torch_available, is_transformers_available, is_transformers_version
 
 
-if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0."):
+if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0.dev0"):
     from .modeling_text_unet import UNetFlatConditionModel
     from .pipeline_versatile_diffusion import VersatileDiffusionPipeline
     from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline
diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py
index ad1e8a900253..c0294b4a3d23 100644
--- a/src/diffusers/utils/import_utils.py
+++ b/src/diffusers/utils/import_utils.py
@@ -308,7 +308,7 @@ def requires_backends(obj, backends):
         "VersatileDiffusionPipeline",
         "VersatileDiffusionDualGuidedPipeline",
         "StableDiffusionImageVariationPipeline",
-    ] and is_transformers_version("<", "4.25.0."):
+    ] and is_transformers_version("<", "4.25.0.dev0"):
         raise ImportError(
             f"You need to install `transformers` from 'main' in order to use {name}: \n```\n pip install"
             " git+https://github.com/huggingface/transformers \n```"

From 30f6f4410487b6c1cf5be2da6c7e8fc844fb9a44 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Thu, 24 Nov 2022 12:25:19 +0100
Subject: [PATCH 242/269] add v prediction (#1386)

* add v prediction

* adat euler for v pred

* velocity -> v_prediction

* simplify

* fix naming

* Update src/diffusers/schedulers/scheduling_euler_discrete.py

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* style

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 src/diffusers/schedulers/scheduling_ddim.py     | 17 ++++++++++++++++-
 .../schedulers/scheduling_euler_discrete.py     | 13 ++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 1326b503ed03..3e5ebfe0e8cd 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -122,6 +122,7 @@ def __init__(
         clip_sample: bool = True,
         set_alpha_to_one: bool = True,
         steps_offset: int = 0,
+        prediction_type: str = "epsilon",
     ):
         if trained_betas is not None:
             self.betas = torch.from_numpy(trained_betas)
@@ -138,6 +139,8 @@ def __init__(
         else:
             raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
 
+        self.prediction_type = prediction_type
+
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
 
@@ -258,7 +261,19 @@ def step(
 
         # 3. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
-        pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        if self.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            # predict V
+            model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction`"
+            )
 
         # 4. Clip "predicted x_0"
         if self.config.clip_sample:
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index 3b2262fcc6d5..332c428c66c6 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -78,6 +78,7 @@ def __init__(
         beta_end: float = 0.02,
         beta_schedule: str = "linear",
         trained_betas: Optional[np.ndarray] = None,
+        prediction_type: str = "epsilon",
     ):
         if trained_betas is not None:
             self.betas = torch.from_numpy(trained_betas)
@@ -91,6 +92,8 @@ def __init__(
         else:
             raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
 
+        self.prediction_type = prediction_type
+
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
 
@@ -229,7 +232,15 @@ def step(
             sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
 
         # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
-        pred_original_sample = sample - sigma_hat * model_output
+        if self.prediction_type == "epsilon":
+            pred_original_sample = sample - sigma_hat * model_output
+        elif self.prediction_type == "v_prediction":
+            # * c_out + input * c_skip
+            pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.prediction_type} must be one of `epsilon`, or `v_prediction`"
+            )
 
         # 2. Convert to an ODE derivative
         derivative = (sample - pred_original_sample) / sigma_hat

From cecdd8bdd1c0ac902483e464f40ebdaa91f3fe13 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Thu, 24 Nov 2022 14:49:03 +0100
Subject: [PATCH 243/269] Adapt UNet2D for supre-resolution (#1385)

* allow disabling self attention

* add class_embedding

* fix copies

* fix condition

* fix copies

* do_self_attention -> only_cross_attention

* fix copies

* num_classes -> num_class_embeds

* fix default value
---
 src/diffusers/models/attention.py             | 11 ++++++++-
 src/diffusers/models/unet_2d_blocks.py        |  8 +++++++
 src/diffusers/models/unet_2d_condition.py     | 19 +++++++++++++++
 .../versatile_diffusion/modeling_text_unet.py | 23 +++++++++++++++++++
 4 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 0aacddf34d00..4c970d062d64 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -100,6 +100,7 @@ def __init__(
         activation_fn: str = "geglu",
         num_embeds_ada_norm: Optional[int] = None,
         use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
     ):
         super().__init__()
         self.use_linear_projection = use_linear_projection
@@ -157,6 +158,7 @@ def __init__(
                     activation_fn=activation_fn,
                     num_embeds_ada_norm=num_embeds_ada_norm,
                     attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
                 )
                 for d in range(num_layers)
             ]
@@ -387,14 +389,17 @@ def __init__(
         activation_fn: str = "geglu",
         num_embeds_ada_norm: Optional[int] = None,
         attention_bias: bool = False,
+        only_cross_attention: bool = False,
     ):
         super().__init__()
+        self.only_cross_attention = only_cross_attention
         self.attn1 = CrossAttention(
             query_dim=dim,
             heads=num_attention_heads,
             dim_head=attention_head_dim,
             dropout=dropout,
             bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
         )  # is a self-attention
         self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
         self.attn2 = CrossAttention(
@@ -461,7 +466,11 @@ def forward(self, hidden_states, context=None, timestep=None):
         norm_hidden_states = (
             self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
         )
-        hidden_states = self.attn1(norm_hidden_states) + hidden_states
+
+        if self.only_cross_attention:
+            hidden_states = self.attn1(norm_hidden_states, context) + hidden_states
+        else:
+            hidden_states = self.attn1(norm_hidden_states) + hidden_states
 
         # 2. Cross-Attention
         norm_hidden_states = (
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 5a8a97187f11..e919d21f4a03 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -34,6 +34,7 @@ def get_down_block(
     downsample_padding=None,
     dual_cross_attention=False,
     use_linear_projection=False,
+    only_cross_attention=False,
 ):
     down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
     if down_block_type == "DownBlock2D":
@@ -78,6 +79,7 @@ def get_down_block(
             attn_num_head_channels=attn_num_head_channels,
             dual_cross_attention=dual_cross_attention,
             use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
         )
     elif down_block_type == "SkipDownBlock2D":
         return SkipDownBlock2D(
@@ -143,6 +145,7 @@ def get_up_block(
     cross_attention_dim=None,
     dual_cross_attention=False,
     use_linear_projection=False,
+    only_cross_attention=False,
 ):
     up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
     if up_block_type == "UpBlock2D":
@@ -174,6 +177,7 @@ def get_up_block(
             attn_num_head_channels=attn_num_head_channels,
             dual_cross_attention=dual_cross_attention,
             use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
         )
     elif up_block_type == "AttnUpBlock2D":
         return AttnUpBlock2D(
@@ -530,6 +534,7 @@ def __init__(
         add_downsample=True,
         dual_cross_attention=False,
         use_linear_projection=False,
+        only_cross_attention=False,
     ):
         super().__init__()
         resnets = []
@@ -564,6 +569,7 @@ def __init__(
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
                         use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
                     )
                 )
             else:
@@ -1129,6 +1135,7 @@ def __init__(
         add_upsample=True,
         dual_cross_attention=False,
         use_linear_projection=False,
+        only_cross_attention=False,
     ):
         super().__init__()
         resnets = []
@@ -1165,6 +1172,7 @@ def __init__(
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
                         use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
                     )
                 )
             else:
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 206097149331..97a26ced5400 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -98,6 +98,7 @@ def __init__(
             "DownBlock2D",
         ),
         up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
         block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
         layers_per_block: int = 2,
         downsample_padding: int = 1,
@@ -109,6 +110,7 @@ def __init__(
         attention_head_dim: Union[int, Tuple[int]] = 8,
         dual_cross_attention: bool = False,
         use_linear_projection: bool = False,
+        num_class_embeds: Optional[int] = None,
     ):
         super().__init__()
 
@@ -124,10 +126,17 @@ def __init__(
 
         self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
 
+        # class embedding
+        if num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+
         self.down_blocks = nn.ModuleList([])
         self.mid_block = None
         self.up_blocks = nn.ModuleList([])
 
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+
         if isinstance(attention_head_dim, int):
             attention_head_dim = (attention_head_dim,) * len(down_block_types)
 
@@ -153,6 +162,7 @@ def __init__(
                 downsample_padding=downsample_padding,
                 dual_cross_attention=dual_cross_attention,
                 use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
             )
             self.down_blocks.append(down_block)
 
@@ -177,6 +187,7 @@ def __init__(
         # up
         reversed_block_out_channels = list(reversed(block_out_channels))
         reversed_attention_head_dim = list(reversed(attention_head_dim))
+        only_cross_attention = list(reversed(only_cross_attention))
         output_channel = reversed_block_out_channels[0]
         for i, up_block_type in enumerate(up_block_types):
             is_final_block = i == len(block_out_channels) - 1
@@ -207,6 +218,7 @@ def __init__(
                 attn_num_head_channels=reversed_attention_head_dim[i],
                 dual_cross_attention=dual_cross_attention,
                 use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
@@ -258,6 +270,7 @@ def forward(
         sample: torch.FloatTensor,
         timestep: Union[torch.Tensor, float, int],
         encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ) -> Union[UNet2DConditionOutput, Tuple]:
         r"""
@@ -310,6 +323,12 @@ def forward(
         t_emb = t_emb.to(dtype=self.dtype)
         emb = self.time_embedding(t_emb)
 
+        if self.config.num_class_embeds is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+            emb = emb + class_emb
+
         # 2. pre-process
         sample = self.conv_in(sample)
 
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index 6d521228e31b..24e79729a599 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -166,6 +166,7 @@ def __init__(
             "CrossAttnUpBlockFlat",
             "CrossAttnUpBlockFlat",
         ),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
         block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
         layers_per_block: int = 2,
         downsample_padding: int = 1,
@@ -177,6 +178,7 @@ def __init__(
         attention_head_dim: Union[int, Tuple[int]] = 8,
         dual_cross_attention: bool = False,
         use_linear_projection: bool = False,
+        num_class_embeds: Optional[int] = None,
     ):
         super().__init__()
 
@@ -192,10 +194,17 @@ def __init__(
 
         self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
 
+        # class embedding
+        if num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+
         self.down_blocks = nn.ModuleList([])
         self.mid_block = None
         self.up_blocks = nn.ModuleList([])
 
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+
         if isinstance(attention_head_dim, int):
             attention_head_dim = (attention_head_dim,) * len(down_block_types)
 
@@ -221,6 +230,7 @@ def __init__(
                 downsample_padding=downsample_padding,
                 dual_cross_attention=dual_cross_attention,
                 use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
             )
             self.down_blocks.append(down_block)
 
@@ -245,6 +255,7 @@ def __init__(
         # up
         reversed_block_out_channels = list(reversed(block_out_channels))
         reversed_attention_head_dim = list(reversed(attention_head_dim))
+        only_cross_attention = list(reversed(only_cross_attention))
         output_channel = reversed_block_out_channels[0]
         for i, up_block_type in enumerate(up_block_types):
             is_final_block = i == len(block_out_channels) - 1
@@ -275,6 +286,7 @@ def __init__(
                 attn_num_head_channels=reversed_attention_head_dim[i],
                 dual_cross_attention=dual_cross_attention,
                 use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
             )
             self.up_blocks.append(up_block)
             prev_output_channel = output_channel
@@ -326,6 +338,7 @@ def forward(
         sample: torch.FloatTensor,
         timestep: Union[torch.Tensor, float, int],
         encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ) -> Union[UNet2DConditionOutput, Tuple]:
         r"""
@@ -378,6 +391,12 @@ def forward(
         t_emb = t_emb.to(dtype=self.dtype)
         emb = self.time_embedding(t_emb)
 
+        if self.config.num_class_embeds is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+            emb = emb + class_emb
+
         # 2. pre-process
         sample = self.conv_in(sample)
 
@@ -648,6 +667,7 @@ def __init__(
         add_downsample=True,
         dual_cross_attention=False,
         use_linear_projection=False,
+        only_cross_attention=False,
     ):
         super().__init__()
         resnets = []
@@ -682,6 +702,7 @@ def __init__(
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
                         use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
                     )
                 )
             else:
@@ -861,6 +882,7 @@ def __init__(
         add_upsample=True,
         dual_cross_attention=False,
         use_linear_projection=False,
+        only_cross_attention=False,
     ):
         super().__init__()
         resnets = []
@@ -897,6 +919,7 @@ def __init__(
                         cross_attention_dim=cross_attention_dim,
                         norm_num_groups=resnet_groups,
                         use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
                     )
                 )
             else:

From 81d8f4a9e157fd247addb815433cc8d9b5e59e35 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 24 Nov 2022 14:54:29 +0100
Subject: [PATCH 244/269] Version 0.9.0.dev0 (#1394)

---
 setup.py                  | 2 +-
 src/diffusers/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index a4c336669e45..c6f2725be1e1 100644
--- a/setup.py
+++ b/setup.py
@@ -212,7 +212,7 @@ def run(self):
 
 setup(
     name="diffusers",
-    version="0.8.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.9.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="Diffusers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index a1faa28000af..4a6661b6b393 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -9,7 +9,7 @@
 )
 
 
-__version__ = "0.8.0"
+__version__ = "0.9.0.dev0"
 
 from .configuration_utils import ConfigMixin
 from .onnx_utils import OnnxRuntimeModel

From e0e86b74709e8671bad028974fe2d6b5c271da02 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 24 Nov 2022 18:23:59 +0100
Subject: [PATCH 245/269] Make height and width optional (#1401)

* fix

* add test

* fix test

* uP

* up

* fix some tests
---
 .../alt_diffusion/pipeline_alt_diffusion.py   | 11 ++--
 .../pipeline_latent_diffusion.py              | 11 ++--
 .../pipeline_flax_stable_diffusion.py         | 20 ++++--
 .../pipeline_onnx_stable_diffusion.py         |  8 ++-
 .../pipeline_onnx_stable_diffusion_inpaint.py | 12 ++--
 .../pipeline_stable_diffusion.py              | 11 ++--
 ...peline_stable_diffusion_image_variation.py | 11 ++--
 .../pipeline_stable_diffusion_inpaint.py      | 11 ++--
 .../pipeline_stable_diffusion_safe.py         | 11 ++--
 .../pipeline_versatile_diffusion.py           | 26 ++++----
 ...ipeline_versatile_diffusion_dual_guided.py | 11 ++--
 ...ine_versatile_diffusion_image_variation.py | 11 ++--
 ...eline_versatile_diffusion_text_to_image.py | 11 ++--
 .../altdiffusion/test_alt_diffusion.py        | 14 ++--
 .../stable_diffusion/test_stable_diffusion.py | 64 +++++++++++++++----
 .../test_stable_diffusion_image_variation.py  | 14 ++--
 .../test_stable_diffusion_inpaint.py          |  4 +-
 .../test_safe_diffusion.py                    |  6 +-
 tests/test_pipelines.py                       |  2 +-
 tests/test_pipelines_flax.py                  |  2 +-
 20 files changed, 176 insertions(+), 95 deletions(-)

diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index 246f2b8720b6..f9458b6bf803 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -390,8 +390,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -411,9 +411,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -459,6 +459,9 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * 8
+        width = width or self.unet.config.sample_size * 8
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(prompt, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
index feb5b00d7438..a52f19ca3178 100644
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -65,8 +65,8 @@ def __init__(
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        height: Optional[int] = 256,
-        width: Optional[int] = 256,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 1.0,
         eta: Optional[float] = 0.0,
@@ -79,9 +79,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 256):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 256):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -106,6 +106,9 @@ def __call__(
             `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
             generated images.
         """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * 8
+        width = width or self.unet.config.sample_size * 8
 
         if isinstance(prompt, str):
             batch_size = 1
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index 9c668d5e51d5..8c2ed06e4966 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -160,13 +160,17 @@ def _generate(
         params: Union[Dict, FrozenDict],
         prng_seed: jax.random.PRNGKey,
         num_inference_steps: int = 50,
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         guidance_scale: float = 7.5,
         latents: Optional[jnp.array] = None,
         debug: bool = False,
         neg_prompt_ids: jnp.array = None,
     ):
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * 8
+        width = width or self.unet.config.sample_size * 8
+
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
@@ -249,8 +253,8 @@ def __call__(
         params: Union[Dict, FrozenDict],
         prng_seed: jax.random.PRNGKey,
         num_inference_steps: int = 50,
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         guidance_scale: float = 7.5,
         latents: jnp.array = None,
         return_dict: bool = True,
@@ -265,9 +269,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -302,6 +306,10 @@ def __call__(
             element is a list of `bool`s denoting whether the corresponding generated image likely represents
             "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
         """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * 8
+        width = width or self.unet.config.sample_size * 8
+
         if jit:
             images = _p_generate(
                 self,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index 9830ace6a167..f64bd4340bac 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -172,8 +172,8 @@ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guida
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        height: Optional[int] = 512,
-        width: Optional[int] = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -187,6 +187,10 @@ def __call__(
         callback_steps: Optional[int] = 1,
         **kwargs,
     ):
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * 8
+        width = width or self.unet.config.sample_size * 8
+
         if isinstance(prompt, str):
             batch_size = 1
         elif isinstance(prompt, list):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index b933c52bf6c4..bbb193a76778 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -236,8 +236,8 @@ def __call__(
         prompt: Union[str, List[str]],
         image: PIL.Image.Image,
         mask_image: PIL.Image.Image,
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -265,9 +265,9 @@ def __call__(
                 repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
                 to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
                 instead of 3, so the expected shape would be `(B, H, W, 1)`.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -312,6 +312,10 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * 8
+        width = width or self.unet.config.sample_size * 8
+
         if isinstance(prompt, str):
             batch_size = 1
         elif isinstance(prompt, list):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index fbfac6b5a0f4..5f3cc41b65a4 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -389,8 +389,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -410,9 +410,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -458,6 +458,9 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * 8
+        width = width or self.unet.config.sample_size * 8
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(prompt, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index 4cfa5817af03..f23161c51c3c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -292,8 +292,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
     def __call__(
         self,
         image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         num_images_per_prompt: Optional[int] = 1,
@@ -315,9 +315,9 @@ def __call__(
                 configuration of
                 [this](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json)
                 `CLIPFeatureExtractor`
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -360,6 +360,9 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * 8
+        width = width or self.unet.config.sample_size * 8
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(image, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 9eb8de248293..544f7583981e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -509,8 +509,8 @@ def __call__(
         prompt: Union[str, List[str]],
         image: Union[torch.FloatTensor, PIL.Image.Image],
         mask_image: Union[torch.FloatTensor, PIL.Image.Image],
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -538,9 +538,9 @@ def __call__(
                 repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
                 to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
                 instead of 3, so the expected shape would be `(B, H, W, 1)`.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -586,6 +586,9 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * 8
+        width = width or self.unet.config.sample_size * 8
 
         # 1. Check inputs
         self.check_inputs(prompt, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
index cfa71b9242b0..1be95418d3e2 100644
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -495,8 +495,8 @@ def perform_safety_guidance(
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -521,9 +521,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -589,6 +589,9 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * 8
+        width = width or self.unet.config.sample_size * 8
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(prompt, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
index 1280419c343d..c60a8836ec5c 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
@@ -111,8 +111,8 @@ def disable_attention_slicing(self):
     def image_variation(
         self,
         image: Union[torch.FloatTensor, PIL.Image.Image],
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -131,9 +131,9 @@ def image_variation(
         Args:
             image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`):
                 The image prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -193,7 +193,7 @@ def image_variation(
         >>> pipe = pipe.to("cuda")
 
         >>> generator = torch.Generator(device="cuda").manual_seed(0)
-        >>> image = pipe(image, generator=generator).images[0]
+        >>> image = pipe.image_variation(image, generator=generator).images[0]
         >>> image.save("./car_variation.png")
         ```
 
@@ -227,8 +227,8 @@ def image_variation(
     def text_to_image(
         self,
         prompt: Union[str, List[str]],
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -247,9 +247,9 @@ def text_to_image(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -341,8 +341,8 @@ def dual_guided(
         prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
         image: Union[str, List[str]],
         text_to_image_strength: float = 0.5,
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         num_images_per_prompt: Optional[int] = 1,
@@ -360,9 +360,9 @@ def dual_guided(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
index ad4e8b0d0aa2..e5dc59389f0a 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
@@ -454,8 +454,8 @@ def __call__(
         prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
         image: Union[str, List[str]],
         text_to_image_strength: float = 0.5,
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         num_images_per_prompt: Optional[int] = 1,
@@ -474,9 +474,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -551,6 +551,9 @@ def __call__(
             [`~pipelines.stable_diffusion.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
             returning a tuple, the first element is a list with the generated images.
         """
+        # 0. Default height and width to unet
+        height = height or self.image_unet.config.sample_size * 8
+        width = width or self.image_unet.config.sample_size * 8
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(prompt, image, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
index 652b7b735af4..53c67d8c2e85 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
@@ -297,8 +297,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
     def __call__(
         self,
         image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -318,9 +318,9 @@ def __call__(
         Args:
             image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`):
                 The image prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -391,6 +391,9 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        # 0. Default height and width to unet
+        height = height or self.image_unet.config.sample_size * 8
+        width = width or self.image_unet.config.sample_size * 8
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(image, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
index d07d734a642f..f38b604bd648 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
@@ -357,8 +357,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        height: int = 512,
-        width: int = 512,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -378,9 +378,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to 512):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to 512):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -443,6 +443,9 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        # 0. Default height and width to unet
+        height = height or self.image_unet.config.sample_size * 8
+        width = width or self.image_unet.config.sample_size * 8
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(prompt, height, width, callback_steps)
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py
index b743d100ce73..bcbadfb3dbd4 100644
--- a/tests/pipelines/altdiffusion/test_alt_diffusion.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py
@@ -171,10 +171,8 @@ def test_alt_diffusion_ddim(self):
         image_slice = image[0, -3:, -3:, -1]
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
-        expected_slice = np.array(
-            [0.49249017, 0.46064827, 0.4790093, 0.50883967, 0.4811985, 0.51540506, 0.5084924, 0.4860553, 0.47318557]
-        )
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5748162, 0.60447145, 0.48821217, 0.50100636, 0.5431185, 0.45763683, 0.49657696, 0.48132733, 0.47573093])
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -220,10 +218,8 @@ def test_alt_diffusion_pndm(self):
         image_slice = image[0, -3:, -3:, -1]
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
-        expected_slice = np.array(
-            [0.4786532, 0.45791715, 0.47507674, 0.50763345, 0.48375353, 0.515062, 0.51244247, 0.48673993, 0.47105807]
-        )
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.51605093, 0.5707241, 0.47365507, 0.50578886, 0.5633877, 0.4642503, 0.5182081, 0.48763484, 0.49084237])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
@@ -259,7 +255,7 @@ def test_alt_diffusion_fp16(self):
         generator = torch.Generator(device=torch_device).manual_seed(0)
         image = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
 
 
 @slow
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 17a293e605fd..53f8024f6cc8 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -207,9 +207,10 @@ def test_stable_diffusion_ddim(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
+        print(", ".join(image_slice.flatten().tolist()))
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -302,9 +303,10 @@ def test_stable_diffusion_pndm(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
+        print(", ".join(image_slice.flatten().tolist()))
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -368,9 +370,10 @@ def test_stable_diffusion_k_lms(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
+        print(", ".join(image_slice.flatten().tolist()))
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -413,9 +416,10 @@ def test_stable_diffusion_k_euler_ancestral(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
+        print(", ".join(image_slice.flatten().tolist()))
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -458,9 +462,10 @@ def test_stable_diffusion_k_euler(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
+        print(", ".join(image_slice.flatten().tolist()))
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -533,7 +538,7 @@ def test_stable_diffusion_negative_prompt(self):
         image = output.images
         image_slice = image[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.4851, 0.4617, 0.4765, 0.5127, 0.4845, 0.5153, 0.5141, 0.4886, 0.4719])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
@@ -563,13 +568,13 @@ def test_stable_diffusion_num_images_per_prompt(self):
         # test num_images_per_prompt=1 (default)
         images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images
 
-        assert images.shape == (1, 128, 128, 3)
+        assert images.shape == (1, 64, 64, 3)
 
         # test num_images_per_prompt=1 (default) for batch of prompts
         batch_size = 2
         images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images
 
-        assert images.shape == (batch_size, 128, 128, 3)
+        assert images.shape == (batch_size, 64, 64, 3)
 
         # test num_images_per_prompt for single prompt
         num_images_per_prompt = 2
@@ -577,7 +582,7 @@ def test_stable_diffusion_num_images_per_prompt(self):
             prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
         ).images
 
-        assert images.shape == (num_images_per_prompt, 128, 128, 3)
+        assert images.shape == (num_images_per_prompt, 64, 64, 3)
 
         # test num_images_per_prompt for batch of prompts
         batch_size = 2
@@ -585,7 +590,7 @@ def test_stable_diffusion_num_images_per_prompt(self):
             [prompt] * batch_size, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
         ).images
 
-        assert images.shape == (batch_size * num_images_per_prompt, 128, 128, 3)
+        assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3)
 
     @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
     def test_stable_diffusion_fp16(self):
@@ -618,7 +623,7 @@ def test_stable_diffusion_fp16(self):
         generator = torch.Generator(device=torch_device).manual_seed(0)
         image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
 
     def test_stable_diffusion_long_prompt(self):
         unet = self.dummy_cond_unet
@@ -671,6 +676,43 @@ def test_stable_diffusion_long_prompt(self):
         assert cap_logger.out.count("@") == 25
         assert cap_logger_3.out == ""
 
+    def test_stable_diffusion_height_width_opt(self):
+        unet = self.dummy_cond_unet
+        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "hey"
+
+        output = sd_pipe(prompt, number_of_steps=2, output_type="np")
+        image_shape = output.images[0].shape[:2]
+        assert image_shape == [32, 32]
+
+        output = sd_pipe(prompt, number_of_steps=2, height=64, width=64, output_type="np")
+        image_shape = output.images[0].shape[:2]
+        assert image_shape == [64, 64]
+
+        config = dict(sd_pipe.unet.config)
+        config["sample_size"] = 96
+        sd_pipe.unet = UNet2DConditionModel.from_config(config)
+        output = sd_pipe(prompt, number_of_steps=2, output_type="np")
+        image_shape = output.images[0].shape[:2]
+        assert image_shape == [96, 96]
+
 
 @slow
 @require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
index 2935275d0f91..a9923089223c 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
@@ -157,7 +157,7 @@ def test_stable_diffusion_img_variation_default_case(self):
         print(image_slice.flatten())
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.4935, 0.4784, 0.4802, 0.5027, 0.4805, 0.5149, 0.5143, 0.4879, 0.4731])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-3
@@ -196,7 +196,7 @@ def test_stable_diffusion_img_variation_multiple_images(self):
 
         image_slice = image[-1, -3:, -3:, -1]
 
-        assert image.shape == (2, 128, 128, 3)
+        assert image.shape == (2, 64, 64, 3)
         expected_slice = np.array([0.4939, 0.4627, 0.4831, 0.5710, 0.5387, 0.4428, 0.5230, 0.5545, 0.4586])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
@@ -228,7 +228,7 @@ def test_stable_diffusion_img_variation_num_images_per_prompt(self):
             output_type="np",
         ).images
 
-        assert images.shape == (1, 128, 128, 3)
+        assert images.shape == (1, 64, 64, 3)
 
         # test num_images_per_prompt=1 (default) for batch of images
         batch_size = 2
@@ -238,7 +238,7 @@ def test_stable_diffusion_img_variation_num_images_per_prompt(self):
             output_type="np",
         ).images
 
-        assert images.shape == (batch_size, 128, 128, 3)
+        assert images.shape == (batch_size, 64, 64, 3)
 
         # test num_images_per_prompt for single prompt
         num_images_per_prompt = 2
@@ -249,7 +249,7 @@ def test_stable_diffusion_img_variation_num_images_per_prompt(self):
             num_images_per_prompt=num_images_per_prompt,
         ).images
 
-        assert images.shape == (num_images_per_prompt, 128, 128, 3)
+        assert images.shape == (num_images_per_prompt, 64, 64, 3)
 
         # test num_images_per_prompt for batch of prompts
         batch_size = 2
@@ -260,7 +260,7 @@ def test_stable_diffusion_img_variation_num_images_per_prompt(self):
             num_images_per_prompt=num_images_per_prompt,
         ).images
 
-        assert images.shape == (batch_size * num_images_per_prompt, 128, 128, 3)
+        assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3)
 
     @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
     def test_stable_diffusion_img_variation_fp16(self):
@@ -297,7 +297,7 @@ def test_stable_diffusion_img_variation_fp16(self):
             output_type="np",
         ).images
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
 
 
 @slow
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 2f9348c5b566..0d8abe5394a1 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -212,7 +212,7 @@ def test_stable_diffusion_inpaint(self):
         image_slice = image[0, -3:, -3:, -1]
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.5075, 0.4485, 0.4558, 0.5369, 0.5369, 0.5236, 0.5127, 0.4983, 0.4776])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -300,7 +300,7 @@ def test_stable_diffusion_inpaint_fp16(self):
             mask_image=mask_image,
         ).images
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
 
 
 @slow
diff --git a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
index dcb3f2730360..d80275c25794 100644
--- a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
+++ b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
@@ -155,7 +155,7 @@ def test_safe_diffusion_ddim(self):
         image_slice = image[0, -3:, -3:, -1]
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -201,7 +201,7 @@ def test_stable_diffusion_pndm(self):
         image_slice = image[0, -3:, -3:, -1]
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
         expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -258,7 +258,7 @@ def test_stable_diffusion_fp16(self):
         generator = torch.Generator(device=torch_device).manual_seed(0)
         image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
 
-        assert image.shape == (1, 128, 128, 3)
+        assert image.shape == (1, 64, 64, 3)
 
 
 @slow
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 19493e323149..f4b81b54c845 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -436,7 +436,7 @@ def test_stable_diffusion_components(self):
 
         assert image_inpaint.shape == (1, 32, 32, 3)
         assert image_img2img.shape == (1, 32, 32, 3)
-        assert image_text2img.shape == (1, 128, 128, 3)
+        assert image_text2img.shape == (1, 64, 64, 3)
 
     def test_set_scheduler(self):
         unet = self.dummy_cond_unet
diff --git a/tests/test_pipelines_flax.py b/tests/test_pipelines_flax.py
index 72316aad9211..9b9dcddd6060 100644
--- a/tests/test_pipelines_flax.py
+++ b/tests/test_pipelines_flax.py
@@ -78,7 +78,7 @@ def test_dummy_all_tpus(self):
 
         images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
 
-        assert images.shape == (num_samples, 1, 128, 128, 3)
+        assert images.shape == (num_samples, 1, 64, 64, 3)
         if jax.device_count() == 8:
             assert np.abs(np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 3.1111548) < 1e-3
             assert np.abs(np.abs(images, dtype=np.float32).sum() - 199746.95) < 5e-1

From cbfed0c25606a70b5c4faf819575b46267434847 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 24 Nov 2022 20:05:41 +0100
Subject: [PATCH 246/269] [Config] Add optional arguments (#1395)

* Optional Components

* uP

* finish

* finish

* finish

* Apply suggestions from code review

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>

* up

* Update src/diffusers/pipeline_utils.py

* improve

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
---
 src/diffusers/pipeline_utils.py               |  61 +++++---
 .../alt_diffusion/pipeline_alt_diffusion.py   |  11 +-
 .../pipeline_alt_diffusion_img2img.py         |  11 +-
 .../pipeline_cycle_diffusion.py               |  11 +-
 .../pipeline_onnx_stable_diffusion.py         |  18 +++
 .../pipeline_onnx_stable_diffusion_img2img.py |  10 +-
 .../pipeline_onnx_stable_diffusion_inpaint.py |  10 +-
 ...ne_onnx_stable_diffusion_inpaint_legacy.py |  10 +-
 .../pipeline_stable_diffusion.py              |  11 +-
 ...peline_stable_diffusion_image_variation.py |  11 +-
 .../pipeline_stable_diffusion_img2img.py      |  11 +-
 .../pipeline_stable_diffusion_inpaint.py      |  11 +-
 ...ipeline_stable_diffusion_inpaint_legacy.py |  11 +-
 .../pipeline_stable_diffusion_safe.py         |  12 +-
 tests/test_pipelines.py                       | 146 ++++++++++++++----
 15 files changed, 292 insertions(+), 63 deletions(-)

diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 3f2857fa4f56..224d6a7f8e8a 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -129,10 +129,13 @@ class DiffusionPipeline(ConfigMixin):
 
     Class attributes:
 
-        - **config_name** ([`str`]) -- name of the config file that will store the class and module names of all
+        - **config_name** (`str`) -- name of the config file that will store the class and module names of all
           components of the diffusion pipeline.
+        - **_optional_components** (List[`str`]) -- list of all components that are optional so they don't have to be
+          passed for the pipeline to function (should be overridden by subclasses).
     """
     config_name = "model_index.json"
+    _optional_components = []
 
     def register_modules(self, **kwargs):
         # import it here to avoid circular import
@@ -184,12 +187,19 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike]):
         model_index_dict.pop("_diffusers_version")
         model_index_dict.pop("_module", None)
 
+        expected_modules, optional_kwargs = self._get_signature_keys(self)
+
+        def is_saveable_module(name, value):
+            if name not in expected_modules:
+                return False
+            if name in self._optional_components and value[0] is None:
+                return False
+            return True
+
+        model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)}
+
         for pipeline_component_name in model_index_dict.keys():
             sub_model = getattr(self, pipeline_component_name)
-            if sub_model is None:
-                # edge case for saving a pipeline with safety_checker=None
-                continue
-
             model_cls = sub_model.__class__
 
             save_method_name = None
@@ -523,26 +533,27 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         # some modules can be passed directly to the init
         # in this case they are already instantiated in `kwargs`
         # extract them here
-        expected_modules = set(inspect.signature(pipeline_class.__init__).parameters.keys()) - set(["self"])
+        expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
         passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
 
         init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
 
+        # define init kwargs
+        init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict}
+        init_kwargs = {**init_kwargs, **passed_pipe_kwargs}
+
+        # remove `null` components
+        init_dict = {k: v for k, v in init_dict.items() if v[0] is not None}
+
         if len(unused_kwargs) > 0:
             logger.warning(f"Keyword arguments {unused_kwargs} not recognized.")
 
-        init_kwargs = {}
-
         # import it here to avoid circular import
         from diffusers import pipelines
 
         # 3. Load each module in the pipeline
         for name, (library_name, class_name) in init_dict.items():
-            if class_name is None:
-                # edge case for when the pipeline was saved with safety_checker=None
-                init_kwargs[name] = None
-                continue
-
             # 3.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names
             if class_name.startswith("Flax"):
                 class_name = class_name[4:]
@@ -570,7 +581,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                             f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be"
                             f" {expected_class_obj}"
                         )
-                elif passed_class_obj[name] is None:
+                elif passed_class_obj[name] is None and name not in pipeline_class._optional_components:
                     logger.warning(
                         f"You have passed `None` for {name} to disable its functionality in {pipeline_class}. Note"
                         f" that this might lead to problems when using {pipeline_class} and is not recommended."
@@ -651,11 +662,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
         # 4. Potentially add passed objects if expected
         missing_modules = set(expected_modules) - set(init_kwargs.keys())
-        if len(missing_modules) > 0 and missing_modules <= set(passed_class_obj.keys()):
+        passed_modules = list(passed_class_obj.keys())
+        optional_modules = pipeline_class._optional_components
+        if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules):
             for module in missing_modules:
-                init_kwargs[module] = passed_class_obj[module]
+                init_kwargs[module] = passed_class_obj.get(module, None)
         elif len(missing_modules) > 0:
-            passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys()))
+            passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs
             raise ValueError(
                 f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed."
             )
@@ -664,6 +677,14 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         model = pipeline_class(**init_kwargs)
         return model
 
+    @staticmethod
+    def _get_signature_keys(obj):
+        parameters = inspect.signature(obj.__init__).parameters
+        required_parameters = {k: v for k, v in parameters.items() if v.default is not True}
+        optional_parameters = set({k for k, v in parameters.items() if v.default is True})
+        expected_modules = set(required_parameters.keys()) - set(["self"])
+        return expected_modules, optional_parameters
+
     @property
     def components(self) -> Dict[str, Any]:
         r"""
@@ -688,8 +709,10 @@ def components(self) -> Dict[str, Any]:
         Returns:
             A dictionaly containing all the modules needed to initialize the pipeline.
         """
-        components = {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}
-        expected_modules = set(inspect.signature(self.__init__).parameters.keys()) - set(["self"])
+        expected_modules, optional_parameters = self._get_signature_keys(self)
+        components = {
+            k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters
+        }
 
         if set(components.keys()) != expected_modules:
             raise ValueError(
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index f9458b6bf803..718b6b652aaa 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -67,6 +67,7 @@ class AltDiffusionPipeline(DiffusionPipeline):
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    _optional_components = ["safety_checker", "feature_extractor"]
 
     def __init__(
         self,
@@ -84,6 +85,7 @@ def __init__(
         ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
@@ -114,7 +116,7 @@ def __init__(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered"
@@ -124,6 +126,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -133,6 +141,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     def enable_xformers_memory_efficient_attention(self):
         r"""
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 7fc1658ea078..ba4462bc2049 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -80,6 +80,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    _optional_components = ["safety_checker", "feature_extractor"]
 
     def __init__(
         self,
@@ -97,6 +98,7 @@ def __init__(
         ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
@@ -127,7 +129,7 @@ def __init__(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered"
@@ -137,6 +139,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -146,6 +154,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 8d702b1b0279..25643a0c36bd 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -132,6 +132,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    _optional_components = ["safety_checker", "feature_extractor"]
 
     def __init__(
         self,
@@ -142,6 +143,7 @@ def __init__(
         scheduler: DDIMScheduler,
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
@@ -159,7 +161,7 @@ def __init__(
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
@@ -169,6 +171,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -178,6 +186,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index f64bd4340bac..92abf6bf770b 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -51,6 +51,7 @@ def __init__(
         scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
         safety_checker: OnnxRuntimeModel,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
@@ -81,6 +82,22 @@ def __init__(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
@@ -91,6 +108,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
         r"""
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index 1fc4786e4792..6ef6d390eeda 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -87,6 +87,7 @@ def __init__(
         scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
         safety_checker: OnnxRuntimeModel,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
@@ -117,7 +118,7 @@ def __init__(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
@@ -127,6 +128,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
@@ -137,6 +144,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
     def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index bbb193a76778..1ceb74fb4fec 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -100,6 +100,7 @@ def __init__(
         scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
         safety_checker: OnnxRuntimeModel,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
         logger.info("`OnnxStableDiffusionInpaintPipeline` is experimental and will very likely change in the future.")
@@ -131,7 +132,7 @@ def __init__(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
@@ -141,6 +142,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
@@ -151,6 +158,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
     def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
index 34f1d0e95dc0..03c3e5397b47 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
@@ -86,6 +86,7 @@ def __init__(
         scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
         safety_checker: OnnxRuntimeModel,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
@@ -116,7 +117,7 @@ def __init__(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
@@ -126,6 +127,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
@@ -136,6 +143,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
     def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 5f3cc41b65a4..d88f6f31d36c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -66,6 +66,7 @@ class StableDiffusionPipeline(DiffusionPipeline):
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    _optional_components = ["safety_checker", "feature_extractor"]
 
     def __init__(
         self,
@@ -83,6 +84,7 @@ def __init__(
         ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
@@ -113,7 +115,7 @@ def __init__(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
@@ -123,6 +125,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -132,6 +140,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     def enable_xformers_memory_efficient_attention(self):
         r"""
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index f23161c51c3c..debffb5a6048 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -63,6 +63,7 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    _optional_components = ["safety_checker", "feature_extractor"]
 
     def __init__(
         self,
@@ -79,10 +80,11 @@ def __init__(
         ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warn(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
@@ -92,6 +94,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae=vae,
             image_encoder=image_encoder,
@@ -100,6 +108,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
     def enable_xformers_memory_efficient_attention(self):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 7efd39e72656..fbc779a36f13 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -78,6 +78,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    _optional_components = ["safety_checker", "feature_extractor"]
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__
     def __init__(
@@ -96,6 +97,7 @@ def __init__(
         ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
@@ -126,7 +128,7 @@ def __init__(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
@@ -136,6 +138,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -145,6 +153,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 544f7583981e..24c94572b12e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -150,6 +150,7 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    _optional_components = ["safety_checker", "feature_extractor"]
 
     def __init__(
         self,
@@ -160,6 +161,7 @@ def __init__(
         scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
@@ -191,7 +193,7 @@ def __init__(
             new_config["skip_prk_steps"] = True
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
@@ -201,6 +203,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -210,6 +218,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 003b2668e75a..e9c53e554a8d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -91,6 +91,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
         feature_extractor ([`CLIPFeatureExtractor`]):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
+    _optional_components = ["safety_checker", "feature_extractor"]
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__
     def __init__(
@@ -109,6 +110,7 @@ def __init__(
         ],
         safety_checker: StableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
 
@@ -139,7 +141,7 @@ def __init__(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
@@ -149,6 +151,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -158,6 +166,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
index 1be95418d3e2..592542cd733d 100644
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -56,6 +56,8 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
             Model that extracts features from generated images to be used as inputs for the `safety_checker`.
     """
 
+    _optional_components = ["safety_checker", "feature_extractor"]
+
     def __init__(
         self,
         vae: AutoencoderKL,
@@ -72,6 +74,7 @@ def __init__(
         ],
         safety_checker: SafeStableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
     ):
         super().__init__()
         safety_concept: Optional[str] = (
@@ -107,7 +110,7 @@ def __init__(
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
 
-        if safety_checker is None:
+        if safety_checker is None and requires_safety_checker:
             logger.warning(
                 f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
                 " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
@@ -117,6 +120,12 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
@@ -127,6 +136,7 @@ def __init__(
             feature_extractor=feature_extractor,
         )
         self._safety_text_concept = safety_concept
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     @property
     def safety_concept(self):
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index f4b81b54c845..7e0304972e06 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -14,8 +14,10 @@
 # limitations under the License.
 
 import gc
+import json
 import os
 import random
+import shutil
 import tempfile
 import unittest
 from functools import partial
@@ -40,7 +42,6 @@
     StableDiffusionPipeline,
     UNet2DConditionModel,
     UNet2DModel,
-    VQModel,
     logging,
 )
 from diffusers.pipeline_utils import DiffusionPipeline
@@ -284,32 +285,7 @@ def dummy_cond_unet(self, sample_size=32):
         )
         return model
 
-    def dummy_cond_unet_inpaint(self, sample_size=32):
-        torch.manual_seed(0)
-        model = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=sample_size,
-            in_channels=9,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-        )
-        return model
-
-    def dummy_vq_model(self):
-        torch.manual_seed(0)
-        model = VQModel(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=3,
-        )
-        return model
-
+    @property
     def dummy_vae(self):
         torch.manual_seed(0)
         model = AutoencoderKL(
@@ -322,6 +298,7 @@ def dummy_vae(self):
         )
         return model
 
+    @property
     def dummy_text_encoder(self):
         torch.manual_seed(0)
         config = CLIPTextConfig(
@@ -337,6 +314,7 @@ def dummy_text_encoder(self):
         )
         return CLIPTextModel(config)
 
+    @property
     def dummy_extractor(self):
         def extract(*args, **kwargs):
             class Out:
@@ -383,8 +361,8 @@ def test_stable_diffusion_components(self):
         """Test that components property works correctly"""
         unet = self.dummy_cond_unet()
         scheduler = PNDMScheduler(skip_prk_steps=True)
-        vae = self.dummy_vae()
-        bert = self.dummy_text_encoder()
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         image = self.dummy_image().cpu().permute(0, 2, 3, 1)[0]
@@ -399,7 +377,7 @@ def test_stable_diffusion_components(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=self.dummy_extractor(),
+            feature_extractor=self.dummy_extractor,
         ).to(torch_device)
         img2img = StableDiffusionImg2ImgPipeline(**inpaint.components).to(torch_device)
         text2img = StableDiffusionPipeline(**inpaint.components).to(torch_device)
@@ -439,7 +417,7 @@ def test_stable_diffusion_components(self):
         assert image_text2img.shape == (1, 64, 64, 3)
 
     def test_set_scheduler(self):
-        unet = self.dummy_cond_unet
+        unet = self.dummy_cond_unet()
         scheduler = PNDMScheduler(skip_prk_steps=True)
         vae = self.dummy_vae
         bert = self.dummy_text_encoder
@@ -471,7 +449,7 @@ def test_set_scheduler(self):
         assert isinstance(sd.scheduler, DPMSolverMultistepScheduler)
 
     def test_set_scheduler_consistency(self):
-        unet = self.dummy_cond_unet
+        unet = self.dummy_cond_unet()
         pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
         ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
         vae = self.dummy_vae
@@ -514,6 +492,110 @@ def test_set_scheduler_consistency(self):
 
         assert dict(ddim_config) == dict(ddim_config_2)
 
+    def test_optional_components(self):
+        unet = self.dummy_cond_unet()
+        pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        orig_sd = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=pndm,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=unet,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd = orig_sd
+
+        assert sd.config.requires_safety_checker is True
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd.save_pretrained(tmpdirname)
+
+            # Test that passing None works
+            sd = StableDiffusionPipeline.from_pretrained(
+                tmpdirname, feature_extractor=None, safety_checker=None, requires_safety_checker=False
+            )
+
+            assert sd.config.requires_safety_checker is False
+            assert sd.config.safety_checker == (None, None)
+            assert sd.config.feature_extractor == (None, None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd.save_pretrained(tmpdirname)
+
+            # Test that loading previous None works
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+            assert sd.config.requires_safety_checker is False
+            assert sd.config.safety_checker == (None, None)
+            assert sd.config.feature_extractor == (None, None)
+
+            orig_sd.save_pretrained(tmpdirname)
+
+            # Test that loading without any directory works
+            shutil.rmtree(os.path.join(tmpdirname, "safety_checker"))
+            with open(os.path.join(tmpdirname, sd.config_name)) as f:
+                config = json.load(f)
+                config["safety_checker"] = [None, None]
+            with open(os.path.join(tmpdirname, sd.config_name), "w") as f:
+                json.dump(config, f)
+
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, requires_safety_checker=False)
+            sd.save_pretrained(tmpdirname)
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+            assert sd.config.requires_safety_checker is False
+            assert sd.config.safety_checker == (None, None)
+            assert sd.config.feature_extractor == (None, None)
+
+            # Test that loading from deleted model index works
+            with open(os.path.join(tmpdirname, sd.config_name)) as f:
+                config = json.load(f)
+                del config["safety_checker"]
+                del config["feature_extractor"]
+            with open(os.path.join(tmpdirname, sd.config_name), "w") as f:
+                json.dump(config, f)
+
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
+
+            assert sd.config.requires_safety_checker is False
+            assert sd.config.safety_checker == (None, None)
+            assert sd.config.feature_extractor == (None, None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd.save_pretrained(tmpdirname)
+
+            # Test that partially loading works
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor)
+
+            assert sd.config.requires_safety_checker is False
+            assert sd.config.safety_checker == (None, None)
+            assert sd.config.feature_extractor != (None, None)
+
+            # Test that partially loading works
+            sd = StableDiffusionPipeline.from_pretrained(
+                tmpdirname,
+                feature_extractor=self.dummy_extractor,
+                safety_checker=unet,
+                requires_safety_checker=[True, True],
+            )
+
+            assert sd.config.requires_safety_checker == [True, True]
+            assert sd.config.safety_checker != (None, None)
+            assert sd.config.feature_extractor != (None, None)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd.save_pretrained(tmpdirname)
+            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor)
+
+            assert sd.config.requires_safety_checker == [True, True]
+            assert sd.config.safety_checker != (None, None)
+            assert sd.config.feature_extractor != (None, None)
+
 
 @slow
 class PipelineSlowTests(unittest.TestCase):

From 05a36d5c1a17d0a3dbc7a1efb83ba1bdbfbf1fb2 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 24 Nov 2022 20:33:52 +0100
Subject: [PATCH 247/269] Upscaling fixed (#1402)

* Upscaling fixed

* up

* more fixes

* fix

* more fixes

* finish again

* up
---
 src/diffusers/pipeline_utils.py               |  20 ++--
 .../alt_diffusion/pipeline_alt_diffusion.py   |  11 +-
 .../pipeline_alt_diffusion_img2img.py         |   1 +
 .../pipeline_latent_diffusion.py              |   9 +-
 .../pipeline_flax_stable_diffusion.py         |  20 ++--
 .../pipeline_onnx_stable_diffusion.py         |  12 +-
 .../pipeline_onnx_stable_diffusion_inpaint.py |  16 ++-
 ...ne_onnx_stable_diffusion_inpaint_legacy.py |   7 +-
 .../pipeline_stable_diffusion.py              |  11 +-
 ...peline_stable_diffusion_image_variation.py |  11 +-
 .../pipeline_stable_diffusion_img2img.py      |   1 +
 .../pipeline_stable_diffusion_inpaint.py      |  15 ++-
 ...ipeline_stable_diffusion_inpaint_legacy.py |   7 +-
 .../pipeline_stable_diffusion_safe.py         |  11 +-
 .../pipeline_versatile_diffusion.py           |  13 ++-
 ...ipeline_versatile_diffusion_dual_guided.py |  11 +-
 ...ine_versatile_diffusion_image_variation.py |  11 +-
 ...eline_versatile_diffusion_text_to_image.py |  11 +-
 .../altdiffusion/test_alt_diffusion.py        |   8 +-
 .../latent_diffusion/test_latent_diffusion.py |   4 +-
 .../stable_diffusion/test_stable_diffusion.py | 104 +++++++++++++++---
 .../test_stable_diffusion_image_variation.py  |   5 +-
 .../test_stable_diffusion_inpaint.py          |  15 +--
 .../test_stable_diffusion_inpaint_legacy.py   |   6 +-
 .../test_safe_diffusion.py                    |   4 +-
 tests/test_pipelines.py                       |   2 +-
 26 files changed, 226 insertions(+), 120 deletions(-)

diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 224d6a7f8e8a..d2c5516220bc 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -544,7 +544,14 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         init_kwargs = {**init_kwargs, **passed_pipe_kwargs}
 
         # remove `null` components
-        init_dict = {k: v for k, v in init_dict.items() if v[0] is not None}
+        def load_module(name, value):
+            if value[0] is None:
+                return False
+            if name in passed_class_obj and passed_class_obj[name] is None:
+                return False
+            return True
+
+        init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
 
         if len(unused_kwargs) > 0:
             logger.warning(f"Keyword arguments {unused_kwargs} not recognized.")
@@ -560,12 +567,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
             is_pipeline_module = hasattr(pipelines, library_name)
             loaded_sub_model = None
-            sub_model_should_be_defined = True
 
             # if the model is in a pipeline module, then we load it from the pipeline
             if name in passed_class_obj:
                 # 1. check that passed_class_obj has correct parent class
-                if not is_pipeline_module and passed_class_obj[name] is not None:
+                if not is_pipeline_module:
                     library = importlib.import_module(library_name)
                     class_obj = getattr(library, class_name)
                     importable_classes = LOADABLE_CLASSES[library_name]
@@ -581,12 +587,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                             f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be"
                             f" {expected_class_obj}"
                         )
-                elif passed_class_obj[name] is None and name not in pipeline_class._optional_components:
-                    logger.warning(
-                        f"You have passed `None` for {name} to disable its functionality in {pipeline_class}. Note"
-                        f" that this might lead to problems when using {pipeline_class} and is not recommended."
-                    )
-                    sub_model_should_be_defined = False
                 else:
                     logger.warning(
                         f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
@@ -608,7 +608,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 importable_classes = LOADABLE_CLASSES[library_name]
                 class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
 
-            if loaded_sub_model is None and sub_model_should_be_defined:
+            if loaded_sub_model is None:
                 load_method_name = None
                 for class_name, class_candidate in class_candidates.items():
                     if class_candidate is not None and issubclass(class_obj, class_candidate):
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index 718b6b652aaa..1a11bfa4547f 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -141,6 +141,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     def enable_xformers_memory_efficient_attention(self):
@@ -379,7 +380,7 @@ def check_inputs(self, prompt, height, width, callback_steps):
             )
 
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if latents is None:
             if device.type == "mps":
                 # randn does not work reproducibly on mps
@@ -420,9 +421,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -469,8 +470,8 @@ def __call__(
             (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * 8
-        width = width or self.unet.config.sample_size * 8
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(prompt, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index ba4462bc2049..2d81a42554a5 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -154,6 +154,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
index a52f19ca3178..0e903cb836a3 100644
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -60,6 +60,7 @@ def __init__(
     ):
         super().__init__()
         self.register_modules(vqvae=vqvae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
+        self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1)
 
     @torch.no_grad()
     def __call__(
@@ -79,9 +80,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -107,8 +108,8 @@ def __call__(
             generated images.
         """
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * 8
-        width = width or self.unet.config.sample_size * 8
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         if isinstance(prompt, str):
             batch_size = 1
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index 8c2ed06e4966..e682030c89cc 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -106,6 +106,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
 
     def prepare_inputs(self, prompt: Union[str, List[str]]):
         if not isinstance(prompt, (str, list)):
@@ -168,8 +169,8 @@ def _generate(
         neg_prompt_ids: jnp.array = None,
     ):
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * 8
-        width = width or self.unet.config.sample_size * 8
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
@@ -192,7 +193,12 @@ def _generate(
         uncond_embeddings = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
         context = jnp.concatenate([uncond_embeddings, text_embeddings])
 
-        latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
+        latents_shape = (
+            batch_size,
+            self.unet.in_channels,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
         if latents is None:
             latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32)
         else:
@@ -269,9 +275,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -307,8 +313,8 @@ def __call__(
             "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * 8
-        width = width or self.unet.config.sample_size * 8
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         if jit:
             images = _p_generate(
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index 92abf6bf770b..71f5dae03471 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -108,6 +108,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
@@ -206,8 +207,8 @@ def __call__(
         **kwargs,
     ):
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * 8
-        width = width or self.unet.config.sample_size * 8
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         if isinstance(prompt, str):
             batch_size = 1
@@ -241,7 +242,12 @@ def __call__(
 
         # get the initial random noise unless the user supplied it
         latents_dtype = text_embeddings.dtype
-        latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
+        latents_shape = (
+            batch_size * num_images_per_prompt,
+            4,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
         if latents is None:
             latents = generator.randn(*latents_shape).astype(latents_dtype)
         elif latents.shape != latents_shape:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 1ceb74fb4fec..94da6bdbb27f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -158,6 +158,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
@@ -273,9 +274,9 @@ def __call__(
                 repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
                 to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
                 instead of 3, so the expected shape would be `(B, H, W, 1)`.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -321,8 +322,8 @@ def __call__(
             (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * 8
-        width = width or self.unet.config.sample_size * 8
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         if isinstance(prompt, str):
             batch_size = 1
@@ -358,7 +359,12 @@ def __call__(
         )
 
         num_channels_latents = NUM_LATENT_CHANNELS
-        latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
+        latents_shape = (
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
         latents_dtype = text_embeddings.dtype
         if latents is None:
             latents = generator.randn(*latents_shape).astype(latents_dtype)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
index 03c3e5397b47..55631f0a5834 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
@@ -27,11 +27,11 @@ def preprocess(image):
     return 2.0 * image - 1.0
 
 
-def preprocess_mask(mask):
+def preprocess_mask(mask, scale_factor=8):
     mask = mask.convert("L")
     w, h = mask.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
+    mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL.Image.NEAREST)
     mask = np.array(mask).astype(np.float32) / 255.0
     mask = np.tile(mask, (4, 1, 1))
     mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
@@ -143,6 +143,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
@@ -349,7 +350,7 @@ def __call__(
 
         # preprocess mask
         if not isinstance(mask_image, np.ndarray):
-            mask_image = preprocess_mask(mask_image)
+            mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
         mask_image = mask_image.astype(latents_dtype)
         mask = np.concatenate([mask_image] * num_images_per_prompt, axis=0)
 
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index d88f6f31d36c..28acc7fcbd4e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -140,6 +140,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     def enable_xformers_memory_efficient_attention(self):
@@ -378,7 +379,7 @@ def check_inputs(self, prompt, height, width, callback_steps):
             )
 
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if latents is None:
             if device.type == "mps":
                 # randn does not work reproducibly on mps
@@ -419,9 +420,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -468,8 +469,8 @@ def __call__(
             (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * 8
-        width = width or self.unet.config.sample_size * 8
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(prompt, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index debffb5a6048..2a351e5665c2 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -108,6 +108,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
@@ -281,7 +282,7 @@ def check_inputs(self, image, height, width, callback_steps):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if latents is None:
             if device.type == "mps":
                 # randn does not work reproducibly on mps
@@ -324,9 +325,9 @@ def __call__(
                 configuration of
                 [this](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json)
                 `CLIPFeatureExtractor`
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -370,8 +371,8 @@ def __call__(
             (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * 8
-        width = width or self.unet.config.sample_size * 8
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(image, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index fbc779a36f13..97b0c20eb25a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -153,6 +153,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 24c94572b12e..e768108d46e0 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -218,6 +218,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
@@ -468,7 +469,7 @@ def check_inputs(self, prompt, height, width, callback_steps):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if latents is None:
             if device.type == "mps":
                 # randn does not work reproducibly on mps
@@ -490,7 +491,9 @@ def prepare_mask_latents(
         # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
         # and half precision
-        mask = torch.nn.functional.interpolate(mask, size=(height // 8, width // 8))
+        mask = torch.nn.functional.interpolate(
+            mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
+        )
         mask = mask.to(device=device, dtype=dtype)
 
         masked_image = masked_image.to(device=device, dtype=dtype)
@@ -547,9 +550,9 @@ def __call__(
                 repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
                 to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
                 instead of 3, so the expected shape would be `(B, H, W, 1)`.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -596,8 +599,8 @@ def __call__(
             (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * 8
-        width = width or self.unet.config.sample_size * 8
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         # 1. Check inputs
         self.check_inputs(prompt, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index e9c53e554a8d..d28e2bef5abe 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -51,11 +51,11 @@ def preprocess_image(image):
     return 2.0 * image - 1.0
 
 
-def preprocess_mask(mask):
+def preprocess_mask(mask, scale_factor=8):
     mask = mask.convert("L")
     w, h = mask.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w // 8, h // 8), resample=PIL_INTERPOLATION["nearest"])
+    mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
     mask = np.array(mask).astype(np.float32) / 255.0
     mask = np.tile(mask, (4, 1, 1))
     mask = mask[None].transpose(0, 1, 2, 3)  # what does this step do?
@@ -166,6 +166,7 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
@@ -541,7 +542,7 @@ def __call__(
             init_image = preprocess_image(init_image)
 
         if not isinstance(mask_image, torch.FloatTensor):
-            mask_image = preprocess_mask(mask_image)
+            mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
 
         # 5. set timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
index 592542cd733d..c5adb168957a 100644
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -136,6 +136,7 @@ def __init__(
             feature_extractor=feature_extractor,
         )
         self._safety_text_concept = safety_concept
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     @property
@@ -443,7 +444,7 @@ def check_inputs(self, prompt, height, width, callback_steps):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if latents is None:
             if device.type == "mps":
                 # randn does not work reproducibly on mps
@@ -531,9 +532,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -600,8 +601,8 @@ def __call__(
             (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * 8
-        width = width or self.unet.config.sample_size * 8
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(prompt, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
index c60a8836ec5c..7be7f4d3aee6 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py
@@ -78,6 +78,7 @@ def __init__(
             vae=vae,
             scheduler=scheduler,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
 
     def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
         r"""
@@ -131,9 +132,9 @@ def image_variation(
         Args:
             image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`):
                 The image prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -247,9 +248,9 @@ def text_to_image(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -360,9 +361,9 @@ def dual_guided(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
index e5dc59389f0a..2e150ca89787 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
@@ -87,6 +87,7 @@ def __init__(
             vae=vae,
             scheduler=scheduler,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
 
         if self.text_unet is not None and (
             "dual_cross_attention" not in self.image_unet.config or not self.image_unet.config.dual_cross_attention
@@ -419,7 +420,7 @@ def check_inputs(self, prompt, image, height, width, callback_steps):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if latents is None:
             if device.type == "mps":
                 # randn does not work reproducibly on mps
@@ -474,9 +475,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -552,8 +553,8 @@ def __call__(
             returning a tuple, the first element is a list with the generated images.
         """
         # 0. Default height and width to unet
-        height = height or self.image_unet.config.sample_size * 8
-        width = width or self.image_unet.config.sample_size * 8
+        height = height or self.image_unet.config.sample_size * self.vae_scale_factor
+        width = width or self.image_unet.config.sample_size * self.vae_scale_factor
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(prompt, image, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
index 53c67d8c2e85..25947570628e 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
@@ -71,6 +71,7 @@ def __init__(
             vae=vae,
             scheduler=scheduler,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention with unet->image_unet
     def enable_xformers_memory_efficient_attention(self):
@@ -277,7 +278,7 @@ def check_inputs(self, image, height, width, callback_steps):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if latents is None:
             if device.type == "mps":
                 # randn does not work reproducibly on mps
@@ -318,9 +319,9 @@ def __call__(
         Args:
             image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`):
                 The image prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -392,8 +393,8 @@ def __call__(
             (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.image_unet.config.sample_size * 8
-        width = width or self.image_unet.config.sample_size * 8
+        height = height or self.image_unet.config.sample_size * self.vae_scale_factor
+        width = width or self.image_unet.config.sample_size * self.vae_scale_factor
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(image, height, width, callback_steps)
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
index f38b604bd648..4845d5cab5ec 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
@@ -75,6 +75,7 @@ def __init__(
             vae=vae,
             scheduler=scheduler,
         )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
 
         if self.text_unet is not None:
             self._swap_unet_attention_blocks()
@@ -337,7 +338,7 @@ def check_inputs(self, prompt, height, width, callback_steps):
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
     def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if latents is None:
             if device.type == "mps":
                 # randn does not work reproducibly on mps
@@ -378,9 +379,9 @@ def __call__(
         Args:
             prompt (`str` or `List[str]`):
                 The prompt or prompts to guide the image generation.
-            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            height (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
+            width (`int`, *optional*, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -444,8 +445,8 @@ def __call__(
             (nsfw) content, according to the `safety_checker`.
         """
         # 0. Default height and width to unet
-        height = height or self.image_unet.config.sample_size * 8
-        width = width or self.image_unet.config.sample_size * 8
+        height = height or self.image_unet.config.sample_size * self.vae_scale_factor
+        width = width or self.image_unet.config.sample_size * self.vae_scale_factor
 
         # 1. Check inputs. Raise error if not correct
         self.check_inputs(prompt, height, width, callback_steps)
diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py
index bcbadfb3dbd4..91fe76444920 100644
--- a/tests/pipelines/altdiffusion/test_alt_diffusion.py
+++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py
@@ -172,7 +172,9 @@ def test_alt_diffusion_ddim(self):
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5748162, 0.60447145, 0.48821217, 0.50100636, 0.5431185, 0.45763683, 0.49657696, 0.48132733, 0.47573093])
+        expected_slice = np.array(
+            [0.5748162, 0.60447145, 0.48821217, 0.50100636, 0.5431185, 0.45763683, 0.49657696, 0.48132733, 0.47573093]
+        )
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -219,7 +221,9 @@ def test_alt_diffusion_pndm(self):
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.51605093, 0.5707241, 0.47365507, 0.50578886, 0.5633877, 0.4642503, 0.5182081, 0.48763484, 0.49084237])
+        expected_slice = np.array(
+            [0.51605093, 0.5707241, 0.47365507, 0.50578886, 0.5633877, 0.4642503, 0.5182081, 0.48763484, 0.49084237]
+        )
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/tests/pipelines/latent_diffusion/test_latent_diffusion.py
index 085cdb4e766b..9d5c07809dc0 100644
--- a/tests/pipelines/latent_diffusion/test_latent_diffusion.py
+++ b/tests/pipelines/latent_diffusion/test_latent_diffusion.py
@@ -111,8 +111,8 @@ def test_inference_text2img(self):
         image_slice = image[0, -3:, -3:, -1]
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5074, 0.5026, 0.4998, 0.4056, 0.3523, 0.4649, 0.5289, 0.5299, 0.4897])
+        assert image.shape == (1, 16, 16, 3)
+        expected_slice = np.array([0.6806, 0.5454, 0.5638, 0.4893, 0.4656, 0.4257, 0.6248, 0.5217, 0.5498])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 53f8024f6cc8..b63aeefce5a8 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -207,11 +207,22 @@ def test_stable_diffusion_ddim(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
-        print(", ".join(image_slice.flatten().tolist()))
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])
+        expected_slice = np.array(
+            [
+                0.5643956661224365,
+                0.6017904281616211,
+                0.4799129366874695,
+                0.5267305374145508,
+                0.5584856271743774,
+                0.46413588523864746,
+                0.5159522294998169,
+                0.4963662028312683,
+                0.47919973731040955,
+            ]
+        )
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -256,12 +267,13 @@ def test_stable_diffusion_ddim_factor_8(self):
             num_inference_steps=2,
             output_type="np",
         )
+        sd_pipe.enable_attention_slicing()
         image = output.images
 
         image_slice = image[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 134, 134, 3)
-        expected_slice = np.array([0.7834, 0.5488, 0.5781, 0.46, 0.3609, 0.5369, 0.542, 0.4855, 0.5557])
+        assert image.shape == (1, 536, 536, 3)
+        expected_slice = np.array([0.5445, 0.8108, 0.6242, 0.4863, 0.5779, 0.5423, 0.4749, 0.4589, 0.4616])
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
@@ -303,11 +315,22 @@ def test_stable_diffusion_pndm(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
-        print(", ".join(image_slice.flatten().tolist()))
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])
+        expected_slice = np.array(
+            [
+                0.5094760060310364,
+                0.5674174427986145,
+                0.46675148606300354,
+                0.5125715136528015,
+                0.5696930289268494,
+                0.4674668312072754,
+                0.5277683734893799,
+                0.4964486062526703,
+                0.494540274143219,
+            ]
+        )
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
@@ -370,11 +393,22 @@ def test_stable_diffusion_k_lms(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
-        print(", ".join(image_slice.flatten().tolist()))
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
+        expected_slice = np.array(
+            [
+                0.47082293033599854,
+                0.5371589064598083,
+                0.4562119245529175,
+                0.5220914483070374,
+                0.5733777284622192,
+                0.4795039892196655,
+                0.5465868711471558,
+                0.5074326395988464,
+                0.5042197108268738,
+            ]
+        )
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
@@ -416,11 +450,22 @@ def test_stable_diffusion_k_euler_ancestral(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
-        print(", ".join(image_slice.flatten().tolist()))
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
+        expected_slice = np.array(
+            [
+                0.4707113206386566,
+                0.5372191071510315,
+                0.4563021957874298,
+                0.5220003724098206,
+                0.5734264850616455,
+                0.4794946610927582,
+                0.5463782548904419,
+                0.5074145197868347,
+                0.504422664642334,
+            ]
+        )
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
@@ -462,11 +507,22 @@ def test_stable_diffusion_k_euler(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
-        print(", ".join(image_slice.flatten().tolist()))
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
+        expected_slice = np.array(
+            [
+                0.47082313895225525,
+                0.5371587872505188,
+                0.4562119245529175,
+                0.5220913887023926,
+                0.5733776688575745,
+                0.47950395941734314,
+                0.546586811542511,
+                0.5074326992034912,
+                0.5042197108268738,
+            ]
+        )
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
@@ -539,7 +595,19 @@ def test_stable_diffusion_negative_prompt(self):
         image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.4851, 0.4617, 0.4765, 0.5127, 0.4845, 0.5153, 0.5141, 0.4886, 0.4719])
+        expected_slice = np.array(
+            [
+                0.5108221173286438,
+                0.5688379406929016,
+                0.4685141146183014,
+                0.5098261833190918,
+                0.5657756328582764,
+                0.4631010890007019,
+                0.5226285457611084,
+                0.49129390716552734,
+                0.4899061322212219,
+            ]
+        )
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
     def test_stable_diffusion_num_images_per_prompt(self):
@@ -700,18 +768,18 @@ def test_stable_diffusion_height_width_opt(self):
 
         output = sd_pipe(prompt, number_of_steps=2, output_type="np")
         image_shape = output.images[0].shape[:2]
-        assert image_shape == [32, 32]
+        assert image_shape == (64, 64)
 
-        output = sd_pipe(prompt, number_of_steps=2, height=64, width=64, output_type="np")
+        output = sd_pipe(prompt, number_of_steps=2, height=96, width=96, output_type="np")
         image_shape = output.images[0].shape[:2]
-        assert image_shape == [64, 64]
+        assert image_shape == (96, 96)
 
         config = dict(sd_pipe.unet.config)
         config["sample_size"] = 96
-        sd_pipe.unet = UNet2DConditionModel.from_config(config)
+        sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device)
         output = sd_pipe(prompt, number_of_steps=2, output_type="np")
         image_shape = output.images[0].shape[:2]
-        assert image_shape == [96, 96]
+        assert image_shape == (192, 192)
 
 
 @slow
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
index a9923089223c..9b350d42e1f0 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
@@ -154,11 +154,10 @@ def test_stable_diffusion_img_variation_default_case(self):
         )[0]
 
         image_slice = image[0, -3:, -3:, -1]
-        print(image_slice.flatten())
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.4935, 0.4784, 0.4802, 0.5027, 0.4805, 0.5149, 0.5143, 0.4879, 0.4731])
+        expected_slice = np.array([0.5093, 0.5717, 0.4806, 0.4891, 0.5552, 0.4594, 0.5177, 0.4894, 0.4904])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-3
 
@@ -197,7 +196,7 @@ def test_stable_diffusion_img_variation_multiple_images(self):
         image_slice = image[-1, -3:, -3:, -1]
 
         assert image.shape == (2, 64, 64, 3)
-        expected_slice = np.array([0.4939, 0.4627, 0.4831, 0.5710, 0.5387, 0.4428, 0.5230, 0.5545, 0.4586])
+        expected_slice = np.array([0.6427, 0.5452, 0.5602, 0.5478, 0.5968, 0.6211, 0.5538, 0.5514, 0.5281])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
 
     def test_stable_diffusion_img_variation_num_images_per_prompt(self):
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 0d8abe5394a1..e85fae939e9a 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -167,8 +167,8 @@ def test_stable_diffusion_inpaint(self):
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
 
         # make sure here that pndm scheduler skips prk
         sd_pipe = StableDiffusionInpaintPipeline(
@@ -213,7 +213,8 @@ def test_stable_diffusion_inpaint(self):
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5075, 0.4485, 0.4558, 0.5369, 0.5369, 0.5236, 0.5127, 0.4983, 0.4776])
+        expected_slice = np.array([0.4723, 0.5731, 0.3939, 0.5441, 0.5922, 0.4392, 0.5059, 0.4651, 0.4474])
+
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
@@ -226,8 +227,8 @@ def test_stable_diffusion_inpaint_with_num_images_per_prompt(self):
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
 
         # make sure here that pndm scheduler skips prk
         sd_pipe = StableDiffusionInpaintPipeline(
@@ -268,8 +269,8 @@ def test_stable_diffusion_inpaint_fp16(self):
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
-        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
 
         # put models in fp16
         unet = unet.half()
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
index 94106b6ba819..4b972c7b7d2a 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py
@@ -168,7 +168,7 @@ def test_stable_diffusion_inpaint_legacy(self):
 
         image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
         init_image = Image.fromarray(np.uint8(image)).convert("RGB")
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32))
 
         # make sure here that pndm scheduler skips prk
         sd_pipe = StableDiffusionInpaintPipelineLegacy(
@@ -227,7 +227,7 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self):
 
         image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
         init_image = Image.fromarray(np.uint8(image)).convert("RGB")
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32))
 
         # make sure here that pndm scheduler skips prk
         sd_pipe = StableDiffusionInpaintPipelineLegacy(
@@ -273,7 +273,7 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
 
         image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
         init_image = Image.fromarray(np.uint8(image)).convert("RGB")
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32))
 
         # make sure here that pndm scheduler skips prk
         sd_pipe = StableDiffusionInpaintPipelineLegacy(
diff --git a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
index d80275c25794..dbb991479377 100644
--- a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
+++ b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py
@@ -156,7 +156,7 @@ def test_safe_diffusion_ddim(self):
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])
+        expected_slice = np.array([0.5644, 0.6018, 0.4799, 0.5267, 0.5585, 0.4641, 0.516, 0.4964, 0.4792])
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@@ -202,7 +202,7 @@ def test_stable_diffusion_pndm(self):
         image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])
+        expected_slice = np.array([0.5095, 0.5674, 0.4668, 0.5126, 0.5697, 0.4675, 0.5278, 0.4964, 0.4945])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 7e0304972e06..a1bee2969687 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -367,7 +367,7 @@ def test_stable_diffusion_components(self):
 
         image = self.dummy_image().cpu().permute(0, 2, 3, 1)[0]
         init_image = Image.fromarray(np.uint8(image)).convert("RGB")
-        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32))
 
         # make sure here that pndm scheduler skips prk
         inpaint = StableDiffusionInpaintPipelineLegacy(

From bb2c64a08c181b450afe61dd88b2f0a575bc414b Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 24 Nov 2022 21:57:27 +0100
Subject: [PATCH 248/269] Add the new SD2 attention params to the VD text unet
 (#1400)

---
 .../versatile_diffusion/modeling_text_unet.py      | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index 24e79729a599..e3c35dcb38bc 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -28,7 +28,9 @@ def get_down_block(
     resnet_groups=None,
     cross_attention_dim=None,
     downsample_padding=None,
-    dual_cross_attention=None,
+    dual_cross_attention=False,
+    use_linear_projection=False,
+    only_cross_attention=False,
 ):
     down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
     if down_block_type == "DownBlockFlat":
@@ -58,6 +60,9 @@ def get_down_block(
             downsample_padding=downsample_padding,
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
         )
     raise ValueError(f"{down_block_type} is not supported.")
 
@@ -75,7 +80,9 @@ def get_up_block(
     attn_num_head_channels,
     resnet_groups=None,
     cross_attention_dim=None,
-    dual_cross_attention=None,
+    dual_cross_attention=False,
+    use_linear_projection=False,
+    only_cross_attention=False,
 ):
     up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
     if up_block_type == "UpBlockFlat":
@@ -105,6 +112,9 @@ def get_up_block(
             resnet_groups=resnet_groups,
             cross_attention_dim=cross_attention_dim,
             attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
         )
     raise ValueError(f"{up_block_type} is not supported.")
 

From 8e2c4cd56cd75c076b04ad0869aca074f307bea7 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Thu, 24 Nov 2022 22:32:44 +0100
Subject: [PATCH 249/269] Deprecate sample size (#1406)

* up

* up

* fix

* uP

* more fixes

* up

* uP

* up

* up

* uP

* fix final tests
---
 src/diffusers/configuration_utils.py          | 12 ++--
 src/diffusers/modeling_utils.py               |  6 +-
 .../alt_diffusion/pipeline_alt_diffusion.py   | 22 ++++++
 .../pipeline_alt_diffusion_img2img.py         | 22 ++++++
 .../pipeline_cycle_diffusion.py               | 21 ++++++
 .../pipeline_flax_stable_diffusion.py         | 24 ++++++-
 .../pipeline_onnx_stable_diffusion.py         | 22 ++++++
 .../pipeline_onnx_stable_diffusion_img2img.py | 22 ++++++
 .../pipeline_onnx_stable_diffusion_inpaint.py | 22 ++++++
 ...ne_onnx_stable_diffusion_inpaint_legacy.py | 22 ++++++
 .../pipeline_stable_diffusion.py              | 22 ++++++
 ...peline_stable_diffusion_image_variation.py | 25 ++++++-
 .../pipeline_stable_diffusion_img2img.py      | 22 ++++++
 .../pipeline_stable_diffusion_inpaint.py      | 22 ++++++
 ...ipeline_stable_diffusion_inpaint_legacy.py | 22 ++++++
 .../pipeline_stable_diffusion_safe.py         | 22 ++++++
 src/diffusers/utils/deprecation_utils.py      |  2 +-
 .../stable_diffusion/test_stable_diffusion.py | 15 ++--
 tests/test_utils.py                           | 16 ++---
 v1-inference.yaml                             | 70 +++++++++++++++++++
 20 files changed, 407 insertions(+), 26 deletions(-)
 create mode 100644 v1-inference.yaml

diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index c4819ddc2ec6..eef901f8ff83 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -91,9 +91,6 @@ class ConfigMixin:
     def register_to_config(self, **kwargs):
         if self.config_name is None:
             raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`")
-        kwargs["_class_name"] = self.__class__.__name__
-        kwargs["_diffusers_version"] = __version__
-
         # Special case for `kwargs` used in deprecation warning added to schedulers
         # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument,
         # or solve in a more general way.
@@ -462,7 +459,7 @@ def extract_init_dict(cls, config_dict, **kwargs):
         unused_kwargs = {**config_dict, **kwargs}
 
         # 7. Define "hidden" config parameters that were saved for compatible classes
-        hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict and not k.startswith("_")}
+        hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict}
 
         return init_dict, unused_kwargs, hidden_config_dict
 
@@ -493,6 +490,9 @@ def to_json_string(self) -> str:
             `str`: String containing all the attributes that make up this configuration instance in JSON format.
         """
         config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {}
+        config_dict["_class_name"] = self.__class__.__name__
+        config_dict["_diffusers_version"] = __version__
+
         return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
 
     def to_json_file(self, json_file_path: Union[str, os.PathLike]):
@@ -520,6 +520,7 @@ def register_to_config(init):
     def inner_init(self, *args, **kwargs):
         # Ignore private kwargs in the init.
         init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
+        config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")}
         init(self, *args, **init_kwargs)
         if not isinstance(self, ConfigMixin):
             raise RuntimeError(
@@ -545,6 +546,7 @@ def inner_init(self, *args, **kwargs):
                 if k not in ignore and k not in new_kwargs
             }
         )
+        new_kwargs = {**config_init_kwargs, **new_kwargs}
         getattr(self, "register_to_config")(**new_kwargs)
 
     return inner_init
@@ -562,7 +564,7 @@ def init(self, *args, **kwargs):
             )
 
         # Ignore private kwargs in the init. Retrieve all passed attributes
-        init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
+        init_kwargs = {k: v for k, v in kwargs.items()}
 
         # Retrieve default values
         fields = dataclasses.fields(self)
diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py
index 704ba00cad29..8cb0acf52f42 100644
--- a/src/diffusers/modeling_utils.py
+++ b/src/diffusers/modeling_utils.py
@@ -448,7 +448,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         if low_cpu_mem_usage:
             # Instantiate model with empty weights
             with accelerate.init_empty_weights():
-                model, unused_kwargs = cls.from_config(
+                config, unused_kwargs = cls.load_config(
                     config_path,
                     cache_dir=cache_dir,
                     return_unused_kwargs=True,
@@ -462,6 +462,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                     device_map=device_map,
                     **kwargs,
                 )
+                model = cls.from_config(config, **unused_kwargs)
 
             # if device_map is Non,e load the state dict on move the params from meta device to the cpu
             if device_map is None:
@@ -482,7 +483,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 "error_msgs": [],
             }
         else:
-            model, unused_kwargs = cls.from_config(
+            config, unused_kwargs = cls.load_config(
                 config_path,
                 cache_dir=cache_dir,
                 return_unused_kwargs=True,
@@ -496,6 +497,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 device_map=device_map,
                 **kwargs,
             )
+            model = cls.from_config(config, **unused_kwargs)
 
             state_dict = load_state_dict(model_file)
             model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index 1a11bfa4547f..285df656c6cf 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -18,6 +18,7 @@
 import torch
 
 from diffusers.utils import is_accelerate_available
+from packaging import version
 from transformers import CLIPFeatureExtractor, XLMRobertaTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -132,6 +133,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 2d81a42554a5..5622afaf324a 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -20,6 +20,7 @@
 
 import PIL
 from diffusers.utils import is_accelerate_available
+from packaging import version
 from transformers import CLIPFeatureExtractor, XLMRobertaTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -145,6 +146,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 25643a0c36bd..7b445ef93fa5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -20,6 +20,7 @@
 
 import PIL
 from diffusers.utils import is_accelerate_available
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -176,6 +177,26 @@ def __init__(
                 "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
 
         self.register_modules(
             vae=vae,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index e682030c89cc..dbe3b7db9d38 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -23,6 +23,7 @@
 from flax.core.frozen_dict import FrozenDict
 from flax.jax_utils import unreplicate
 from flax.training.common_utils import shard
+from packaging import version
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
 
@@ -34,7 +35,7 @@
     FlaxLMSDiscreteScheduler,
     FlaxPNDMScheduler,
 )
-from ...utils import logging
+from ...utils import deprecate, logging
 from . import FlaxStableDiffusionPipelineOutput
 from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
 
@@ -97,6 +98,27 @@ def __init__(
                 " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index 71f5dae03471..3caab834befd 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -18,6 +18,7 @@
 import numpy as np
 import torch
 
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -98,6 +99,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index 6ef6d390eeda..4d42201676c9 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -19,6 +19,7 @@
 import torch
 
 import PIL
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -134,6 +135,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 94da6bdbb27f..863f7b7aaea8 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -19,6 +19,7 @@
 import torch
 
 import PIL
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -148,6 +149,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
index 55631f0a5834..631e7129e950 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
@@ -5,6 +5,7 @@
 import torch
 
 import PIL
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -133,6 +134,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 28acc7fcbd4e..f3ab6ce49569 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -18,6 +18,7 @@
 import torch
 
 from diffusers.utils import is_accelerate_available
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -131,6 +132,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index 2a351e5665c2..822bb840278c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -19,8 +19,10 @@
 
 import PIL
 from diffusers.utils import is_accelerate_available
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection
 
+from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import (
@@ -31,7 +33,7 @@
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from ...utils import logging
+from ...utils import deprecate, logging
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
 
@@ -100,6 +102,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae=vae,
             image_encoder=image_encoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 97b0c20eb25a..1ebd7692185c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -20,6 +20,7 @@
 
 import PIL
 from diffusers.utils import is_accelerate_available
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -144,6 +145,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index e768108d46e0..205505da269f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -20,6 +20,7 @@
 
 import PIL
 from diffusers.utils import is_accelerate_available
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -209,6 +210,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index d28e2bef5abe..15923d51c52f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -20,6 +20,7 @@
 
 import PIL
 from diffusers.utils import is_accelerate_available
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -157,6 +158,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
index c5adb168957a..7948bbecf8ec 100644
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -5,6 +5,7 @@
 import numpy as np
 import torch
 
+from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -126,6 +127,27 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
         self.register_modules(
             vae=vae,
             text_encoder=text_encoder,
diff --git a/src/diffusers/utils/deprecation_utils.py b/src/diffusers/utils/deprecation_utils.py
index eac43031574f..7c8bfc901b31 100644
--- a/src/diffusers/utils/deprecation_utils.py
+++ b/src/diffusers/utils/deprecation_utils.py
@@ -32,7 +32,7 @@ def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn
 
         if warning is not None:
             warning = warning + " " if standard_warn else ""
-            warnings.warn(warning + message, DeprecationWarning)
+            warnings.warn(warning + message, FutureWarning)
 
     if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0:
         call_frame = inspect.getouterframes(inspect.currentframe())[1]
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index b63aeefce5a8..0efcb9ad8839 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -262,18 +262,17 @@ def test_stable_diffusion_ddim_factor_8(self):
             [prompt],
             generator=generator,
             guidance_scale=6.0,
-            height=536,
-            width=536,
+            height=136,
+            width=136,
             num_inference_steps=2,
             output_type="np",
         )
-        sd_pipe.enable_attention_slicing()
         image = output.images
 
         image_slice = image[0, -3:, -3:, -1]
 
-        assert image.shape == (1, 536, 536, 3)
-        expected_slice = np.array([0.5445, 0.8108, 0.6242, 0.4863, 0.5779, 0.5423, 0.4749, 0.4589, 0.4616])
+        assert image.shape == (1, 136, 136, 3)
+        expected_slice = np.array([0.5524, 0.5626, 0.6069, 0.4727, 0.386, 0.3995, 0.4613, 0.4328, 0.4269])
 
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
@@ -766,18 +765,18 @@ def test_stable_diffusion_height_width_opt(self):
 
         prompt = "hey"
 
-        output = sd_pipe(prompt, number_of_steps=2, output_type="np")
+        output = sd_pipe(prompt, number_of_steps=1, output_type="np")
         image_shape = output.images[0].shape[:2]
         assert image_shape == (64, 64)
 
-        output = sd_pipe(prompt, number_of_steps=2, height=96, width=96, output_type="np")
+        output = sd_pipe(prompt, number_of_steps=1, height=96, width=96, output_type="np")
         image_shape = output.images[0].shape[:2]
         assert image_shape == (96, 96)
 
         config = dict(sd_pipe.unet.config)
         config["sample_size"] = 96
         sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device)
-        output = sd_pipe(prompt, number_of_steps=2, output_type="np")
+        output = sd_pipe(prompt, number_of_steps=1, output_type="np")
         image_shape = output.images[0].shape[:2]
         assert image_shape == (192, 192)
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 35cf57421014..761242eb9ac4 100755
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -26,7 +26,7 @@ class DeprecateTester(unittest.TestCase):
     def test_deprecate_function_arg(self):
         kwargs = {"deprecated_arg": 4}
 
-        with self.assertWarns(DeprecationWarning) as warning:
+        with self.assertWarns(FutureWarning) as warning:
             output = deprecate("deprecated_arg", self.higher_version, "message", take_from=kwargs)
 
         assert output == 4
@@ -39,7 +39,7 @@ def test_deprecate_function_arg(self):
     def test_deprecate_function_arg_tuple(self):
         kwargs = {"deprecated_arg": 4}
 
-        with self.assertWarns(DeprecationWarning) as warning:
+        with self.assertWarns(FutureWarning) as warning:
             output = deprecate(("deprecated_arg", self.higher_version, "message"), take_from=kwargs)
 
         assert output == 4
@@ -51,7 +51,7 @@ def test_deprecate_function_arg_tuple(self):
 
     def test_deprecate_function_args(self):
         kwargs = {"deprecated_arg_1": 4, "deprecated_arg_2": 8}
-        with self.assertWarns(DeprecationWarning) as warning:
+        with self.assertWarns(FutureWarning) as warning:
             output_1, output_2 = deprecate(
                 ("deprecated_arg_1", self.higher_version, "Hey"),
                 ("deprecated_arg_2", self.higher_version, "Hey"),
@@ -81,7 +81,7 @@ def test_deprecate_function_incorrect_arg(self):
         assert "got an unexpected keyword argument `deprecated_arg`" in str(error.exception)
 
     def test_deprecate_arg_no_kwarg(self):
-        with self.assertWarns(DeprecationWarning) as warning:
+        with self.assertWarns(FutureWarning) as warning:
             deprecate(("deprecated_arg", self.higher_version, "message"))
 
         assert (
@@ -90,7 +90,7 @@ def test_deprecate_arg_no_kwarg(self):
         )
 
     def test_deprecate_args_no_kwarg(self):
-        with self.assertWarns(DeprecationWarning) as warning:
+        with self.assertWarns(FutureWarning) as warning:
             deprecate(
                 ("deprecated_arg_1", self.higher_version, "Hey"),
                 ("deprecated_arg_2", self.higher_version, "Hey"),
@@ -108,7 +108,7 @@ def test_deprecate_class_obj(self):
         class Args:
             arg = 5
 
-        with self.assertWarns(DeprecationWarning) as warning:
+        with self.assertWarns(FutureWarning) as warning:
             arg = deprecate(("arg", self.higher_version, "message"), take_from=Args())
 
         assert arg == 5
@@ -122,7 +122,7 @@ class Args:
             arg = 5
             foo = 7
 
-        with self.assertWarns(DeprecationWarning) as warning:
+        with self.assertWarns(FutureWarning) as warning:
             arg_1, arg_2 = deprecate(
                 ("arg", self.higher_version, "message"),
                 ("foo", self.higher_version, "message"),
@@ -158,7 +158,7 @@ def test_deprecate_incorrect_version(self):
         )
 
     def test_deprecate_incorrect_no_standard_warn(self):
-        with self.assertWarns(DeprecationWarning) as warning:
+        with self.assertWarns(FutureWarning) as warning:
             deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)
 
         assert str(warning.warning) == "This message is better!!!"
diff --git a/v1-inference.yaml b/v1-inference.yaml
new file mode 100644
index 000000000000..d4effe569e89
--- /dev/null
+++ b/v1-inference.yaml
@@ -0,0 +1,70 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

From d50e3217459558cc2979f38818f1835751d4fc97 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Thu, 24 Nov 2022 22:42:59 +0100
Subject: [PATCH 250/269] Support SD2 attention slicing (#1397)

* Support SD2 attention slicing

* Support SD2 attention slicing

* Add more copies

* Use attn_num_head_channels in blocks

* fix-copies

* Update tests

* fix imports
---
 src/diffusers/models/unet_2d_blocks.py        |  42 +-
 src/diffusers/models/unet_2d_condition.py     |  14 +-
 .../alt_diffusion/pipeline_alt_diffusion.py   |  11 +-
 .../pipeline_alt_diffusion_img2img.py         |  11 +-
 .../pipeline_cycle_diffusion.py               |  11 +-
 .../pipeline_stable_diffusion.py              |  11 +-
 ...peline_stable_diffusion_image_variation.py |  11 +-
 .../pipeline_stable_diffusion_img2img.py      |  11 +-
 .../pipeline_stable_diffusion_inpaint.py      |  11 +-
 ...ipeline_stable_diffusion_inpaint_legacy.py |  11 +-
 .../versatile_diffusion/modeling_text_unet.py |  56 +-
 ...ipeline_versatile_diffusion_dual_guided.py |  11 +-
 ...ine_versatile_diffusion_image_variation.py |  11 +-
 ...eline_versatile_diffusion_text_to_image.py |  11 +-
 .../pipelines/stable_diffusion_2/__init__.py  |   0
 .../test_stable_diffusion.py                  | 740 ++++++++++++++++++
 16 files changed, 892 insertions(+), 81 deletions(-)
 create mode 100644 tests/pipelines/stable_diffusion_2/__init__.py
 create mode 100644 tests/pipelines/stable_diffusion_2/test_stable_diffusion.py

diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index e919d21f4a03..6b4a88c0ae3d 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -404,15 +404,17 @@ def __init__(
         self.resnets = nn.ModuleList(resnets)
 
     def set_attention_slice(self, slice_size):
-        if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+        head_dims = self.attn_num_head_channels
+        head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
+        if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
             raise ValueError(
-                f"Make sure slice_size {slice_size} is a divisor of "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"Make sure slice_size {slice_size} is a common divisor of "
+                f"the number of heads used in cross_attention: {head_dims}"
             )
-        if slice_size is not None and slice_size > self.attn_num_head_channels:
+        if slice_size is not None and slice_size > min(head_dims):
             raise ValueError(
-                f"Chunk_size {slice_size} has to be smaller or equal to "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"slice_size {slice_size} has to be smaller or equal to "
+                f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
             )
 
         for attn in self.attentions:
@@ -600,15 +602,17 @@ def __init__(
         self.gradient_checkpointing = False
 
     def set_attention_slice(self, slice_size):
-        if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+        head_dims = self.attn_num_head_channels
+        head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
+        if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
             raise ValueError(
-                f"Make sure slice_size {slice_size} is a divisor of "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"Make sure slice_size {slice_size} is a common divisor of "
+                f"the number of heads used in cross_attention: {head_dims}"
             )
-        if slice_size is not None and slice_size > self.attn_num_head_channels:
+        if slice_size is not None and slice_size > min(head_dims):
             raise ValueError(
-                f"Chunk_size {slice_size} has to be smaller or equal to "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"slice_size {slice_size} has to be smaller or equal to "
+                f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
             )
 
         for attn in self.attentions:
@@ -1197,15 +1201,17 @@ def __init__(
         self.gradient_checkpointing = False
 
     def set_attention_slice(self, slice_size):
-        if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+        head_dims = self.attn_num_head_channels
+        head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
+        if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
             raise ValueError(
-                f"Make sure slice_size {slice_size} is a divisor of "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"Make sure slice_size {slice_size} is a common divisor of "
+                f"the number of heads used in cross_attention: {head_dims}"
             )
-        if slice_size is not None and slice_size > self.attn_num_head_channels:
+        if slice_size is not None and slice_size > min(head_dims):
             raise ValueError(
-                f"Chunk_size {slice_size} has to be smaller or equal to "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"slice_size {slice_size} has to be smaller or equal to "
+                f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
             )
 
         for attn in self.attentions:
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 97a26ced5400..1b43f960d997 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -229,15 +229,17 @@ def __init__(
         self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
 
     def set_attention_slice(self, slice_size):
-        if slice_size is not None and self.config.attention_head_dim % slice_size != 0:
+        head_dims = self.config.attention_head_dim
+        head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
+        if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
             raise ValueError(
-                f"Make sure slice_size {slice_size} is a divisor of "
-                f"the number of heads used in cross_attention {self.config.attention_head_dim}"
+                f"Make sure slice_size {slice_size} is a common divisor of "
+                f"the number of heads used in cross_attention: {head_dims}"
             )
-        if slice_size is not None and slice_size > self.config.attention_head_dim:
+        if slice_size is not None and slice_size > min(head_dims):
             raise ValueError(
-                f"Chunk_size {slice_size} has to be smaller or equal to "
-                f"the number of heads used in cross_attention {self.config.attention_head_dim}"
+                f"slice_size {slice_size} has to be smaller or equal to "
+                f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
             )
 
         for block in self.down_blocks:
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index 285df656c6cf..893174a8692a 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -198,9 +198,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+
         self.unet.set_attention_slice(slice_size)
 
     def disable_attention_slicing(self):
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 5622afaf324a..f7baedde9813 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -193,9 +193,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+
         self.unet.set_attention_slice(slice_size)
 
     def disable_attention_slicing(self):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 7b445ef93fa5..287fd74b64a1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -224,9 +224,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+
         self.unet.set_attention_slice(slice_size)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index f3ab6ce49569..c9f96fca0bbe 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -197,9 +197,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+
         self.unet.set_attention_slice(slice_size)
 
     def disable_attention_slicing(self):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index 822bb840278c..5e6aa9885cf8 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -169,9 +169,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+
         self.unet.set_attention_slice(slice_size)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 1ebd7692185c..d86847fad653 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -193,9 +193,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+
         self.unet.set_attention_slice(slice_size)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 205505da269f..6fee298bc47b 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -258,9 +258,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+
         self.unet.set_attention_slice(slice_size)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 15923d51c52f..e1e5a33bd4ba 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -206,9 +206,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.unet.config.attention_head_dim // 2
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+
         self.unet.set_attention_slice(slice_size)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index e3c35dcb38bc..fb8855b95f8c 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -307,15 +307,17 @@ def __init__(
         self.conv_out = LinearMultiDim(block_out_channels[0], out_channels, kernel_size=3, padding=1)
 
     def set_attention_slice(self, slice_size):
-        if slice_size is not None and self.config.attention_head_dim % slice_size != 0:
+        head_dims = self.config.attention_head_dim
+        head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
+        if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
             raise ValueError(
-                f"Make sure slice_size {slice_size} is a divisor of "
-                f"the number of heads used in cross_attention {self.config.attention_head_dim}"
+                f"Make sure slice_size {slice_size} is a common divisor of "
+                f"the number of heads used in cross_attention: {head_dims}"
             )
-        if slice_size is not None and slice_size > self.config.attention_head_dim:
+        if slice_size is not None and slice_size > min(head_dims):
             raise ValueError(
-                f"Chunk_size {slice_size} has to be smaller or equal to "
-                f"the number of heads used in cross_attention {self.config.attention_head_dim}"
+                f"slice_size {slice_size} has to be smaller or equal to "
+                f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
             )
 
         for block in self.down_blocks:
@@ -743,15 +745,17 @@ def __init__(
         self.gradient_checkpointing = False
 
     def set_attention_slice(self, slice_size):
-        if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+        head_dims = self.attn_num_head_channels
+        head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
+        if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
             raise ValueError(
-                f"Make sure slice_size {slice_size} is a divisor of "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"Make sure slice_size {slice_size} is a common divisor of "
+                f"the number of heads used in cross_attention: {head_dims}"
             )
-        if slice_size is not None and slice_size > self.attn_num_head_channels:
+        if slice_size is not None and slice_size > min(head_dims):
             raise ValueError(
-                f"Chunk_size {slice_size} has to be smaller or equal to "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"slice_size {slice_size} has to be smaller or equal to "
+                f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
             )
 
         for attn in self.attentions:
@@ -954,15 +958,17 @@ def __init__(
         self.gradient_checkpointing = False
 
     def set_attention_slice(self, slice_size):
-        if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+        head_dims = self.attn_num_head_channels
+        head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
+        if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
             raise ValueError(
-                f"Make sure slice_size {slice_size} is a divisor of "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"Make sure slice_size {slice_size} is a common divisor of "
+                f"the number of heads used in cross_attention: {head_dims}"
             )
-        if slice_size is not None and slice_size > self.attn_num_head_channels:
+        if slice_size is not None and slice_size > min(head_dims):
             raise ValueError(
-                f"Chunk_size {slice_size} has to be smaller or equal to "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"slice_size {slice_size} has to be smaller or equal to "
+                f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
             )
 
         for attn in self.attentions:
@@ -1101,15 +1107,17 @@ def __init__(
         self.resnets = nn.ModuleList(resnets)
 
     def set_attention_slice(self, slice_size):
-        if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
+        head_dims = self.attn_num_head_channels
+        head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
+        if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
             raise ValueError(
-                f"Make sure slice_size {slice_size} is a divisor of "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"Make sure slice_size {slice_size} is a common divisor of "
+                f"the number of heads used in cross_attention: {head_dims}"
             )
-        if slice_size is not None and slice_size > self.attn_num_head_channels:
+        if slice_size is not None and slice_size > min(head_dims):
             raise ValueError(
-                f"Chunk_size {slice_size} has to be smaller or equal to "
-                f"the number of heads used in cross_attention {self.attn_num_head_channels}"
+                f"slice_size {slice_size} has to be smaller or equal to "
+                f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
             )
 
         for attn in self.attentions:
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
index 2e150ca89787..e0c0273b615a 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
@@ -178,9 +178,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.image_unet.config.attention_head_dim // 2
+            if isinstance(self.image_unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.image_unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.image_unet.config.attention_head_dim)
+
         self.image_unet.set_attention_slice(slice_size)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
index 25947570628e..3e51ce6371f0 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
@@ -108,9 +108,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.image_unet.config.attention_head_dim // 2
+            if isinstance(self.image_unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.image_unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.image_unet.config.attention_head_dim)
+
         self.image_unet.set_attention_slice(slice_size)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
index 4845d5cab5ec..1ca57edf91b0 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
@@ -131,9 +131,14 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
                 `attention_head_dim` must be a multiple of `slice_size`.
         """
         if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = self.image_unet.config.attention_head_dim // 2
+            if isinstance(self.image_unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.image_unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.image_unet.config.attention_head_dim)
+
         self.image_unet.set_attention_slice(slice_size)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
diff --git a/tests/pipelines/stable_diffusion_2/__init__.py b/tests/pipelines/stable_diffusion_2/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
new file mode 100644
index 000000000000..4702926e54aa
--- /dev/null
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -0,0 +1,740 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import time
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+    logging,
+)
+from diffusers.utils import load_numpy, slow, torch_device
+from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
+from transformers import CLIPFeatureExtractor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class StableDiffusion2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            # SD2-specific config below
+            attention_head_dim=(2, 4, 8, 8),
+            use_linear_projection=True,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=512,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_save_pretrained_from_pretrained(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        feature_extractor = CLIPFeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=feature_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            sd_pipe.save_pretrained(tmpdirname)
+            sd_pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
+        sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        generator = generator.manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        new_image = output.images
+
+        assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass"
+
+    def test_stable_diffusion_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+        )
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5649, 0.6022, 0.4804, 0.5270, 0.5585, 0.4643, 0.5159, 0.4963, 0.4793])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_pndm(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.5099, 0.5677, 0.4671, 0.5128, 0.5697, 0.4676, 0.5277, 0.4964, 0.4946])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_k_lms(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4717, 0.5376, 0.4568, 0.5225, 0.5734, 0.4797, 0.5467, 0.5074, 0.5043])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_k_euler_ancestral(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = EulerAncestralDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4715, 0.5376, 0.4569, 0.5224, 0.5734, 0.4797, 0.5465, 0.5074, 0.5046])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_k_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = EulerDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4717, 0.5376, 0.4568, 0.5225, 0.5734, 0.4797, 0.5467, 0.5074, 0.5043])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_attention_chunk(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        # make sure chunking the attention yields the same result
+        sd_pipe.enable_attention_slicing(slice_size=1)
+        generator = torch.Generator(device=device).manual_seed(0)
+        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 1e-4
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_fp16(self):
+        """Test that stable diffusion works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
+
+        assert image.shape == (1, 64, 64, 3)
+
+    def test_stable_diffusion_long_prompt(self):
+        unet = self.dummy_cond_unet
+        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=self.dummy_extractor,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        do_classifier_free_guidance = True
+        negative_prompt = None
+        num_images_per_prompt = 1
+        logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")
+
+        prompt = 25 * "@"
+        with CaptureLogger(logger) as cap_logger_3:
+            text_embeddings_3 = sd_pipe._encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+
+        prompt = 100 * "@"
+        with CaptureLogger(logger) as cap_logger:
+            text_embeddings = sd_pipe._encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+
+        negative_prompt = "Hello"
+        with CaptureLogger(logger) as cap_logger_2:
+            text_embeddings_2 = sd_pipe._encode_prompt(
+                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+            )
+
+        assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
+        assert text_embeddings.shape[1] == 77
+
+        assert cap_logger.out == cap_logger_2.out
+        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
+        assert cap_logger.out.count("@") == 25
+        assert cap_logger_3.out == ""
+
+
+@slow
+@require_torch_gpu
+class StableDiffusion2PipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np")
+
+        image = output.images
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0788, 0.0823, 0.1091, 0.1165, 0.1263, 0.1459, 0.1317, 0.1507, 0.1551])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_ddim(self):
+        scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-base", subfolder="scheduler")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base", scheduler=scheduler)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+
+        output = sd_pipe([prompt], generator=generator, num_inference_steps=5, output_type="numpy")
+        image = output.images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0642, 0.0382, 0.0408, 0.0395, 0.0227, 0.0942, 0.0749, 0.0669, 0.0248])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_k_lms(self):
+        scheduler = LMSDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2-base", subfolder="scheduler")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base", scheduler=scheduler)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photograph of an astronaut riding a horse"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe(
+            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=5, output_type="numpy"
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0548, 0.0626, 0.0612, 0.0611, 0.0706, 0.0586, 0.0843, 0.0333, 0.1197])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_memory_chunking(self):
+        torch.cuda.reset_peak_memory_stats()
+        model_id = "stabilityai/stable-diffusion-2-base"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photograph of an astronaut riding a horse"
+
+        # make attention efficient
+        pipe.enable_attention_slicing()
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            output_chunked = pipe(
+                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+            )
+            image_chunked = output_chunked.images
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+        # make sure that less than 3.75 GB is allocated
+        assert mem_bytes < 3.75 * 10**9
+
+        # disable chunking
+        pipe.disable_attention_slicing()
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            output = pipe(
+                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+            )
+            image = output.images
+
+        # make sure that more than 3.75 GB is allocated
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes > 3.75 * 10**9
+        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-3
+
+    def test_stable_diffusion_text2img_pipeline_fp16(self):
+        torch.cuda.reset_peak_memory_stats()
+        model_id = "stabilityai/stable-diffusion-2-base"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photograph of an astronaut riding a horse"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output_chunked = pipe(
+            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+        )
+        image_chunked = output_chunked.images
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            output = pipe(
+                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+            )
+            image = output.images
+
+        # Make sure results are close enough
+        diff = np.abs(image_chunked.flatten() - image.flatten())
+        # They ARE different since ops are not run always at the same precision
+        # however, they should be extremely close.
+        assert diff.mean() < 2e-2
+
+    def test_stable_diffusion_text2img_pipeline_default(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-text2img/astronaut_riding_a_horse.npy"
+        )
+
+        model_id = "stabilityai/stable-diffusion-2-base"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, safety_checker=None)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "astronaut riding a horse"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(prompt=prompt, strength=0.75, guidance_scale=7.5, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 5e-3
+
+    def test_stable_diffusion_text2img_intermediate_state(self):
+        number_of_steps = 0
+
+        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
+            test_callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 0:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([1.8606, 1.3169, -0.0691, 1.2374, -2.309, 1.077, -0.1084, -0.6774, -2.9594])
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+            elif step == 20:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 64, 64)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array([1.078, 1.1804, 1.1339, 0.4664, -0.2354, 0.6097, -0.7749, -0.8784, -0.9465])
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2
+
+        test_callback_fn.has_been_called = False
+
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2-base", revision="fp16", torch_dtype=torch.float16
+        )
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Andromeda galaxy in a bottle"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            pipe(
+                prompt=prompt,
+                num_inference_steps=20,
+                guidance_scale=7.5,
+                generator=generator,
+                callback=test_callback_fn,
+                callback_steps=1,
+            )
+        assert test_callback_fn.has_been_called
+        assert number_of_steps == 21
+
+    def test_stable_diffusion_low_cpu_mem_usage(self):
+        pipeline_id = "stabilityai/stable-diffusion-2-base"
+
+        start_time = time.time()
+        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(
+            pipeline_id, revision="fp16", torch_dtype=torch.float16
+        )
+        pipeline_low_cpu_mem_usage.to(torch_device)
+        low_cpu_mem_usage_time = time.time() - start_time
+
+        start_time = time.time()
+        _ = StableDiffusionPipeline.from_pretrained(
+            pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, low_cpu_mem_usage=False
+        )
+        normal_load_time = time.time() - start_time
+
+        assert 2 * low_cpu_mem_usage_time < normal_load_time
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipeline_id = "stabilityai/stable-diffusion-2-base"
+        prompt = "Andromeda galaxy in a bottle"
+
+        pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, revision="fp16", torch_dtype=torch.float16)
+        pipeline = pipeline.to(torch_device)
+        pipeline.enable_attention_slicing(1)
+        pipeline.enable_sequential_cpu_offload()
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        _ = pipeline(prompt, generator=generator, num_inference_steps=5)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.8 GB is allocated
+        assert mem_bytes < 2.8 * 10**9

From 5c10e68a1feca15bfdabdbdd24c207af6bc099ce Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Fri, 25 Nov 2022 11:25:49 +0100
Subject: [PATCH 251/269] Add SD2 inpainting integration tests (#1412)

SD2 inpainting integration tests
---
 .../test_stable_diffusion_inpaint.py          | 345 ++++++++++++++++++
 1 file changed, 345 insertions(+)
 create mode 100644 tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py

diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
new file mode 100644
index 000000000000..b420570f0707
--- /dev/null
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py
@@ -0,0 +1,345 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel
+from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_cond_unet_inpaint(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=9,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            # SD2-specific config below
+            attention_head_dim=(2, 4, 8, 8),
+            use_linear_projection=True,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=512,
+        )
+        return CLIPTextModel(config)
+
+    @property
+    def dummy_extractor(self):
+        def extract(*args, **kwargs):
+            class Out:
+                def __init__(self):
+                    self.pixel_values = torch.ones([0])
+
+                def to(self, device):
+                    self.pixel_values.to(device)
+                    return self
+
+            return Out()
+
+        return extract
+
+    def test_stable_diffusion_inpaint(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet_inpaint
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        text_encoder = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionInpaintPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+            mask_image=mask_image,
+        )
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+            mask_image=mask_image,
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4727, 0.5735, 0.3941, 0.5446, 0.5926, 0.4394, 0.5062, 0.4654, 0.4476])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_inpaint_fp16(self):
+        """Test that stable diffusion inpaint works with fp16"""
+        unet = self.dummy_cond_unet_inpaint
+        scheduler = PNDMScheduler(skip_prk_steps=True)
+        vae = self.dummy_vae
+        text_encoder = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        text_encoder = text_encoder.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionInpaintPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe(
+            [prompt],
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+            image=init_image,
+            mask_image=mask_image,
+        ).images
+
+        assert image.shape == (1, 64, 64, 3)
+
+
+# @slow
+@require_torch_gpu
+class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_inpaint_pipeline(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-inpaint/init_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint"
+            "/yellow_cat_sitting_on_a_park_bench.npy"
+        )
+
+        model_id = "stabilityai/stable-diffusion-2-inpainting"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-3
+
+    def test_stable_diffusion_inpaint_pipeline_fp16(self):
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-inpaint/init_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint"
+            "/yellow_cat_sitting_on_a_park_bench_fp16.npy"
+        )
+
+        model_id = "stabilityai/stable-diffusion-2-inpainting"
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id,
+            revision="fp16",
+            torch_dtype=torch.float16,
+            safety_checker=None,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 5e-1
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        init_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-inpaint/init_image.png"
+        )
+        mask_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png"
+        )
+
+        model_id = "stabilityai/stable-diffusion-2-inpainting"
+        pndm = PNDMScheduler.from_pretrained(model_id, subfolder="scheduler")
+        pipe = StableDiffusionInpaintPipeline.from_pretrained(
+            model_id,
+            safety_checker=None,
+            scheduler=pndm,
+            device_map="auto",
+            revision="fp16",
+            torch_dtype=torch.float16,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        _ = pipe(
+            prompt=prompt,
+            image=init_image,
+            mask_image=mask_image,
+            generator=generator,
+            num_inference_steps=5,
+            output_type="np",
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.65 GB is allocated
+        assert mem_bytes < 2.65 * 10**9

From 9f10c545cbf54dd4d87e7e0f24e1ec02e928c966 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 25 Nov 2022 11:26:27 +0100
Subject: [PATCH 252/269] Fix sample size conversion script (#1408)

up
---
 ..._original_stable_diffusion_to_diffusers.py |  3 +-
 v1-inference.yaml                             | 70 -------------------
 2 files changed, 2 insertions(+), 71 deletions(-)
 delete mode 100644 v1-inference.yaml

diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py
index 375b12b6f88b..2d354df93818 100644
--- a/scripts/convert_original_stable_diffusion_to_diffusers.py
+++ b/scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -211,6 +211,7 @@ def create_unet_diffusers_config(original_config):
     """
     Creates a config for the diffusers based on the config of the LDM model.
     """
+    model_params = original_config.model.params
     unet_params = original_config.model.params.unet_config.params
 
     block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
@@ -230,7 +231,7 @@ def create_unet_diffusers_config(original_config):
         resolution //= 2
 
     config = dict(
-        sample_size=unet_params.image_size,
+        sample_size=model_params.image_size,
         in_channels=unet_params.in_channels,
         out_channels=unet_params.out_channels,
         down_block_types=tuple(down_block_types),
diff --git a/v1-inference.yaml b/v1-inference.yaml
deleted file mode 100644
index d4effe569e89..000000000000
--- a/v1-inference.yaml
+++ /dev/null
@@ -1,70 +0,0 @@
-model:
-  base_learning_rate: 1.0e-04
-  target: ldm.models.diffusion.ddpm.LatentDiffusion
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false   # Note: different from the one we trained before
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-
-    scheduler_config: # 10000 warmup steps
-      target: ldm.lr_scheduler.LambdaLinearScheduler
-      params:
-        warm_up_steps: [ 10000 ]
-        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
-        f_start: [ 1.e-6 ]
-        f_max: [ 1. ]
-        f_min: [ 1. ]
-
-    unet_config:
-      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
-      params:
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: True
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

From f26cde3dff6b288b4c6e5c84a287373aa8c8a689 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 25 Nov 2022 12:04:40 +0100
Subject: [PATCH 253/269] fix clip guided (#1414)

---
 examples/community/clip_guided_stable_diffusion.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/community/clip_guided_stable_diffusion.py b/examples/community/clip_guided_stable_diffusion.py
index 14d9ee63226d..d0230ab0f3af 100644
--- a/examples/community/clip_guided_stable_diffusion.py
+++ b/examples/community/clip_guided_stable_diffusion.py
@@ -78,7 +78,8 @@ def __init__(
         )
 
         self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
-        self.make_cutouts = MakeCutouts(feature_extractor.size)
+        cut_out_size = feature_extractor.size if isinstance(feature_extractor.size, int) else feature_extractor.size["shortest_edge"]
+        self.make_cutouts = MakeCutouts(cut_out_size)
 
         set_requires_grad(self.text_encoder, False)
         set_requires_grad(self.clip_model, False)

From 29021090614641d2509155ca0021497896228999 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 25 Nov 2022 12:53:10 +0100
Subject: [PATCH 254/269] Fix all stable diffusion (#1415)

* up

* uP
---
 .../community/clip_guided_stable_diffusion.py    |  6 +++++-
 .../alt_diffusion/pipeline_alt_diffusion.py      |  7 ++++++-
 .../pipeline_alt_diffusion_img2img.py            |  7 ++++++-
 .../stable_diffusion/pipeline_cycle_diffusion.py |  7 ++++++-
 .../pipeline_stable_diffusion.py                 |  7 ++++++-
 .../pipeline_stable_diffusion_img2img.py         |  7 ++++++-
 .../pipeline_stable_diffusion_inpaint.py         |  7 ++++++-
 .../pipeline_stable_diffusion_inpaint_legacy.py  |  7 ++++++-
 .../stable_diffusion/test_stable_diffusion.py    |  2 +-
 .../stable_diffusion_2/test_stable_diffusion.py  | 16 ++++++++--------
 10 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/examples/community/clip_guided_stable_diffusion.py b/examples/community/clip_guided_stable_diffusion.py
index d0230ab0f3af..7a319bddf053 100644
--- a/examples/community/clip_guided_stable_diffusion.py
+++ b/examples/community/clip_guided_stable_diffusion.py
@@ -78,7 +78,11 @@ def __init__(
         )
 
         self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
-        cut_out_size = feature_extractor.size if isinstance(feature_extractor.size, int) else feature_extractor.size["shortest_edge"]
+        cut_out_size = (
+            feature_extractor.size
+            if isinstance(feature_extractor.size, int)
+            else feature_extractor.size["shortest_edge"]
+        )
         self.make_cutouts = MakeCutouts(cut_out_size)
 
         set_requires_grad(self.text_encoder, False)
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index 893174a8692a..fd272de88086 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -229,10 +229,15 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
 
         device = torch.device(f"cuda:{gpu_id}")
 
-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+        if self.safety_checker is not None:
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model)
+
     @property
     def _execution_device(self):
         r"""
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index f7baedde9813..75f10b910f23 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -224,10 +224,15 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
 
         device = torch.device(f"cuda:{gpu_id}")
 
-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+        if self.safety_checker is not None:
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model)
+
     @property
     def _execution_device(self):
         r"""
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 287fd74b64a1..c191e67ee000 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -257,10 +257,15 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
 
         device = torch.device(f"cuda:{gpu_id}")
 
-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+        if self.safety_checker is not None:
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model)
+
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
     def _execution_device(self):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index c9f96fca0bbe..403923d820b1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -228,10 +228,15 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
 
         device = torch.device(f"cuda:{gpu_id}")
 
-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+        if self.safety_checker is not None:
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model)
+
     @property
     def _execution_device(self):
         r"""
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index d86847fad653..493ef4b0b01d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -226,10 +226,15 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
 
         device = torch.device(f"cuda:{gpu_id}")
 
-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+        if self.safety_checker is not None:
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model)
+
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
     def _execution_device(self):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 6fee298bc47b..23fbf512f7c6 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -291,10 +291,15 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
 
         device = torch.device(f"cuda:{gpu_id}")
 
-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+        if self.safety_checker is not None:
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model)
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
     def enable_xformers_memory_efficient_attention(self):
         r"""
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index e1e5a33bd4ba..adcfc493aae4 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -239,10 +239,15 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
 
         device = torch.device(f"cuda:{gpu_id}")
 
-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
             if cpu_offloaded_model is not None:
                 cpu_offload(cpu_offloaded_model, device)
 
+        if self.safety_checker is not None:
+            # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
+            # fix by only offloading self.safety_checker for now
+            cpu_offload(self.safety_checker.vision_model)
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
     def enable_xformers_memory_efficient_attention(self):
         r"""
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 0efcb9ad8839..e2e27a211d88 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -948,7 +948,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
                 expected_slice = np.array(
                     [1.8285, 1.2857, -0.1024, 1.2406, -2.3068, 1.0747, -0.0818, -0.6520, -2.9506]
                 )
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-3
             elif step == 50:
                 latents = latents.detach().cpu().numpy()
                 assert latents.shape == (1, 4, 64, 64)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index 4702926e54aa..e1d22662cd83 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -609,11 +609,12 @@ def test_stable_diffusion_memory_chunking(self):
         assert mem_bytes > 3.75 * 10**9
         assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-3
 
-    def test_stable_diffusion_text2img_pipeline_fp16(self):
+    def test_stable_diffusion_same_quality(self):
         torch.cuda.reset_peak_memory_stats()
         model_id = "stabilityai/stable-diffusion-2-base"
         pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
         pipe = pipe.to(torch_device)
+        pipe.enable_attention_slicing()
         pipe.set_progress_bar_config(disable=None)
 
         prompt = "a photograph of an astronaut riding a horse"
@@ -624,18 +625,17 @@ def test_stable_diffusion_text2img_pipeline_fp16(self):
         )
         image_chunked = output_chunked.images
 
+        pipe = StableDiffusionPipeline.from_pretrained(model_id)
+        pipe = pipe.to(torch_device)
         generator = torch.Generator(device=torch_device).manual_seed(0)
-        with torch.autocast(torch_device):
-            output = pipe(
-                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
-            )
-            image = output.images
+        output = pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy")
+        image = output.images
 
         # Make sure results are close enough
         diff = np.abs(image_chunked.flatten() - image.flatten())
         # They ARE different since ops are not run always at the same precision
         # however, they should be extremely close.
-        assert diff.mean() < 2e-2
+        assert diff.mean() < 5e-2
 
     def test_stable_diffusion_text2img_pipeline_default(self):
         expected_image = load_numpy(
@@ -669,7 +669,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
                 assert latents.shape == (1, 4, 64, 64)
                 latents_slice = latents[0, -3:, -3:, -1]
                 expected_slice = np.array([1.8606, 1.3169, -0.0691, 1.2374, -2.309, 1.077, -0.1084, -0.6774, -2.9594])
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-3
             elif step == 20:
                 latents = latents.detach().cpu().numpy()
                 assert latents.shape == (1, 4, 64, 64)

From 2c6bc0f13ba2ba609ac141022b4b56b677d74943 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 25 Nov 2022 12:04:15 +0000
Subject: [PATCH 255/269] small fix

---
 src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py | 2 +-
 .../pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py   | 2 +-
 .../pipelines/stable_diffusion/pipeline_cycle_diffusion.py      | 2 +-
 .../pipelines/stable_diffusion/pipeline_stable_diffusion.py     | 2 +-
 .../stable_diffusion/pipeline_stable_diffusion_img2img.py       | 2 +-
 .../stable_diffusion/pipeline_stable_diffusion_inpaint.py       | 2 +-
 .../pipeline_stable_diffusion_inpaint_legacy.py                 | 2 +-
 .../test_versatile_diffusion_image_variation.py                 | 1 +
 8 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
index fd272de88086..3bbc3b3fd7ff 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py
@@ -236,7 +236,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         if self.safety_checker is not None:
             # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
             # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model)
+            cpu_offload(self.safety_checker.vision_model, device)
 
     @property
     def _execution_device(self):
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 75f10b910f23..23b4b42b5899 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -231,7 +231,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         if self.safety_checker is not None:
             # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
             # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model)
+            cpu_offload(self.safety_checker.vision_model, device)
 
     @property
     def _execution_device(self):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index c191e67ee000..83848905fd4c 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -264,7 +264,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         if self.safety_checker is not None:
             # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
             # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model)
+            cpu_offload(self.safety_checker.vision_model, device)
 
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 403923d820b1..3739ae7a6da0 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -235,7 +235,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         if self.safety_checker is not None:
             # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
             # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model)
+            cpu_offload(self.safety_checker.vision_model, device)
 
     @property
     def _execution_device(self):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 493ef4b0b01d..8fe86992af20 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -233,7 +233,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         if self.safety_checker is not None:
             # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
             # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model)
+            cpu_offload(self.safety_checker.vision_model, device)
 
     @property
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 23fbf512f7c6..8cefffbb8eb1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -298,7 +298,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         if self.safety_checker is not None:
             # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
             # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model)
+            cpu_offload(self.safety_checker.vision_model, device)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
     def enable_xformers_memory_efficient_attention(self):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index adcfc493aae4..1d2c939fef49 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -246,7 +246,7 @@ def enable_sequential_cpu_offload(self, gpu_id=0):
         if self.safety_checker is not None:
             # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
             # fix by only offloading self.safety_checker for now
-            cpu_offload(self.safety_checker.vision_model)
+            cpu_offload(self.safety_checker.vision_model, device)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
     def enable_xformers_memory_efficient_attention(self):
diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
index 4eddc271db52..f8901e287c1b 100644
--- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
@@ -54,5 +54,6 @@ def test_inference_image_variations(self):
         image_slice = image[0, 253:256, 253:256, -1]
 
         assert image.shape == (1, 512, 512, 3)
+        print(torch.from_numpy(image_slice.flatten()))
         expected_slice = np.array([0.0113, 0.2241, 0.4024, 0.0839, 0.0871, 0.2725, 0.2581, 0.0, 0.1096])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

From 35099b207ecd08c6beded93ad8dded9d09abf908 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 25 Nov 2022 13:40:41 +0100
Subject: [PATCH 256/269] [Versatile Diffusion] Fix remaining tests (#1418)

fix all tests
---
 .../pipeline_versatile_diffusion_dual_guided.py            | 4 ++++
 .../pipeline_versatile_diffusion_text_to_image.py          | 2 ++
 .../test_versatile_diffusion_image_variation.py            | 3 +--
 .../versatile_diffusion/test_versatile_diffusion_mega.py   | 7 +++----
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
index e0c0273b615a..fa1754a4f062 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py
@@ -65,6 +65,8 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline):
     vae: AutoencoderKL
     scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
 
+    _optional_components = ["text_unet"]
+
     def __init__(
         self,
         tokenizer: CLIPTokenizer,
@@ -143,6 +145,8 @@ def _revert_dual_attention(self):
                 index = int(index)
                 self.image_unet.get_submodule(parent_name)[index] = module.transformers[0]
 
+        self.image_unet.register_to_config(dual_cross_attention=False)
+
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention with unet->image_unet
     def enable_xformers_memory_efficient_attention(self):
         r"""
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
index 1ca57edf91b0..e77f5a2f22e4 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
@@ -57,6 +57,8 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
     vae: AutoencoderKL
     scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]
 
+    _optional_components = ["text_unet"]
+
     def __init__(
         self,
         tokenizer: CLIPTokenizer,
diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
index f8901e287c1b..1711b752992f 100644
--- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
@@ -54,6 +54,5 @@ def test_inference_image_variations(self):
         image_slice = image[0, 253:256, 253:256, -1]
 
         assert image.shape == (1, 512, 512, 3)
-        print(torch.from_numpy(image_slice.flatten()))
-        expected_slice = np.array([0.0113, 0.2241, 0.4024, 0.0839, 0.0871, 0.2725, 0.2581, 0.0, 0.1096])
+        expected_slice = np.array([0.1205, 0.1914, 0.2289, 0.0883, 0.1595, 0.1683, 0.0703, 0.1493, 0.1298])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
index 1209abf6a8db..c69799c9d482 100644
--- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
@@ -104,7 +104,7 @@ def test_inference_dual_guided_then_text_to_image(self):
         image_slice = image[0, 253:256, 253:256, -1]
 
         assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.014, 0.0112, 0.0136, 0.0145, 0.0107, 0.0113, 0.0272, 0.0215, 0.0216])
+        expected_slice = np.array([0.0081, 0.0032, 0.0002, 0.0056, 0.0027, 0.0000, 0.0051, 0.0020, 0.0007])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
         prompt = "A painting of a squirrel eating a burger "
@@ -119,11 +119,10 @@ def test_inference_dual_guided_then_text_to_image(self):
         expected_slice = np.array([0.0408, 0.0181, 0.0, 0.0388, 0.0046, 0.0461, 0.0411, 0.0, 0.0222])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
-        pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16)
-        image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images[0]
+        image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images
 
         image_slice = image[0, 253:256, 253:256, -1]
 
         assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.0657, 0.0529, 0.0455, 0.0802, 0.0570, 0.0179, 0.0267, 0.0483, 0.0769])
+        expected_slice = np.array([0.3479, 0.1943, 0.1060, 0.3894, 0.2537, 0.1394, 0.3989, 0.3191, 0.1987])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

From babfb8a020778acffd48c5e08968c6570f02fa1d Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 25 Nov 2022 13:59:56 +0100
Subject: [PATCH 257/269] [MPS] call contiguous after permute (#1411)

* call contiguous after permute

Fixes for MPS device

* Fix MPS UserWarning

* make style

* Revert "Fix MPS UserWarning"

This reverts commit b46c32810ee5fdc4c16a8e9224a826490b66cf49.
---
 src/diffusers/models/attention.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 4c970d062d64..e9454a467af1 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -221,11 +221,15 @@ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, retu
         # 3. Output
         if self.is_input_continuous:
             if not self.use_linear_projection:
-                hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2)
+                hidden_states = (
+                    hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
+                )
                 hidden_states = self.proj_out(hidden_states)
             else:
                 hidden_states = self.proj_out(hidden_states)
-                hidden_states = hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2)
+                hidden_states = (
+                    hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
+                )
 
             output = hidden_states + residual
         elif self.is_input_vectorized:

From d52388f48660de5776d9129945d5e960cad59d63 Mon Sep 17 00:00:00 2001
From: Pedro Cuenca <pedro@huggingface.co>
Date: Fri, 25 Nov 2022 14:02:15 +0100
Subject: [PATCH 258/269] Deprecate `predict_epsilon` (#1393)

* Adapt ddpm, ddpmsolver to prediction_type.

* Deprecate predict_epsilon in __init__.

* Bring FlaxDDIMScheduler up to date with DDIMScheduler.

* Set prediction_type as an ivar for consistency.

* Convert pipeline_ddpm

* Adapt tests.

* Adapt unconditional training script.

* Adapt BitDiffusion example.

* Add missing kwargs in dpmsolver_multistep

* Ugly workaround to accept deprecated predict_epsilon when loading
schedulers using from_pretrained.

* make style

* Remove import no longer in use.

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Use config.prediction_type everywhere

* Add a couple of Flax prediction type tests.

* make style

* fix register deprecated arg

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 examples/community/bit_diffusion.py           | 12 +++---
 .../train_unconditional.py                    | 19 +++++----
 src/diffusers/configuration_utils.py          |  5 +++
 .../experimental/rl/value_guided_sampling.py  |  1 +
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 10 ++---
 src/diffusers/schedulers/scheduling_ddim.py   | 24 ++++++++----
 .../schedulers/scheduling_ddim_flax.py        | 29 +++++++++++++-
 src/diffusers/schedulers/scheduling_ddpm.py   | 34 +++++++++++-----
 .../schedulers/scheduling_ddpm_flax.py        | 35 +++++++++++------
 .../scheduling_dpmsolver_multistep.py         | 39 ++++++++++++++-----
 .../scheduling_dpmsolver_multistep_flax.py    | 38 +++++++++++++-----
 .../schedulers/scheduling_euler_discrete.py   |  8 ++--
 tests/pipelines/ddpm/test_ddpm.py             | 31 ++++++++++++++-
 tests/test_config.py                          | 15 ++++++-
 tests/test_pipelines.py                       |  6 +--
 tests/test_scheduler.py                       | 19 +++++----
 tests/test_scheduler_flax.py                  | 22 ++++++++++-
 17 files changed, 260 insertions(+), 87 deletions(-)

diff --git a/examples/community/bit_diffusion.py b/examples/community/bit_diffusion.py
index c0be3a13ad8d..956e25a7e5c5 100644
--- a/examples/community/bit_diffusion.py
+++ b/examples/community/bit_diffusion.py
@@ -138,7 +138,7 @@ def ddpm_bit_scheduler_step(
     model_output: torch.FloatTensor,
     timestep: int,
     sample: torch.FloatTensor,
-    predict_epsilon=True,
+    prediction_type="epsilon",
     generator=None,
     return_dict: bool = True,
 ) -> Union[DDPMSchedulerOutput, Tuple]:
@@ -150,8 +150,8 @@ def ddpm_bit_scheduler_step(
         timestep (`int`): current discrete timestep in the diffusion chain.
         sample (`torch.FloatTensor`):
             current instance of sample being created by diffusion process.
-        predict_epsilon (`bool`):
-            optional flag to use when model predicts the samples directly instead of the noise, epsilon.
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the samples (`sample`).
         generator: random number generator.
         return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class
     Returns:
@@ -174,10 +174,12 @@ def ddpm_bit_scheduler_step(
 
     # 2. compute predicted original sample from predicted noise also called
     # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-    if predict_epsilon:
+    if prediction_type == "epsilon":
         pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
-    else:
+    elif prediction_type == "sample":
         pred_original_sample = model_output
+    else:
+        raise ValueError(f"Unsupported prediction_type {prediction_type}.")
 
     # 3. Clip "predicted x_0"
     scale = self.bit_scale
diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py
index 54a94d98b578..6abe46c57def 100644
--- a/examples/unconditional_image_generation/train_unconditional.py
+++ b/examples/unconditional_image_generation/train_unconditional.py
@@ -194,9 +194,10 @@ def parse_args():
     )
 
     parser.add_argument(
-        "--predict_epsilon",
-        action="store_true",
-        default=True,
+        "--prediction_type",
+        type=str,
+        default="epsilon",
+        choices=["epsilon", "sample"],
         help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.",
     )
 
@@ -256,13 +257,13 @@ def main(args):
             "UpBlock2D",
         ),
     )
-    accepts_predict_epsilon = "predict_epsilon" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys())
+    accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys())
 
-    if accepts_predict_epsilon:
+    if accepts_prediction_type:
         noise_scheduler = DDPMScheduler(
             num_train_timesteps=args.ddpm_num_steps,
             beta_schedule=args.ddpm_beta_schedule,
-            predict_epsilon=args.predict_epsilon,
+            prediction_type=args.prediction_type,
         )
     else:
         noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule)
@@ -365,9 +366,9 @@ def transforms(examples):
                 # Predict the noise residual
                 model_output = model(noisy_images, timesteps).sample
 
-                if args.predict_epsilon:
+                if args.prediction_type == "epsilon":
                     loss = F.mse_loss(model_output, noise)  # this could have different weights!
-                else:
+                elif args.prediction_type == "sample":
                     alpha_t = _extract_into_tensor(
                         noise_scheduler.alphas_cumprod, timesteps, (clean_images.shape[0], 1, 1, 1)
                     )
@@ -376,6 +377,8 @@ def transforms(examples):
                         model_output, clean_images, reduction="none"
                     )  # use SNR weighting from distillation paper
                     loss = loss.mean()
+                else:
+                    raise ValueError(f"Unsupported prediction type: {args.prediction_type}")
 
                 accelerator.backward(loss)
 
diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index eef901f8ff83..1a7499c6118d 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -195,6 +195,11 @@ def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_un
         if "dtype" in unused_kwargs:
             init_dict["dtype"] = unused_kwargs.pop("dtype")
 
+        if "predict_epsilon" in unused_kwargs and "prediction_type" not in init_dict:
+            deprecate("remove this", "0.10.0", "remove")
+            predict_epsilon = unused_kwargs.pop("predict_epsilon")
+            init_dict["prediction_type"] = "epsilon" if predict_epsilon else "sample"
+
         # Return model and optionally state and/or unused_kwargs
         model = cls(**init_dict)
 
diff --git a/src/diffusers/experimental/rl/value_guided_sampling.py b/src/diffusers/experimental/rl/value_guided_sampling.py
index 8d5062e3d4c5..4dd935f54d60 100644
--- a/src/diffusers/experimental/rl/value_guided_sampling.py
+++ b/src/diffusers/experimental/rl/value_guided_sampling.py
@@ -89,6 +89,7 @@ def run_diffusion(self, x, conditions, n_guide_steps, scale):
                 x = x + scale * grad
                 x = self.reset_x0(x, conditions, self.action_dim)
             prev_x = self.unet(x.permute(0, 2, 1), timesteps).sample.permute(0, 2, 1)
+            # TODO: set prediction_type when instantiating the model
             x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"]
 
             # apply conditions to the trajectory
diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index 634e1c0f99f6..31791caf9ebe 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -70,14 +70,14 @@ def __call__(
             generated images.
         """
         message = (
-            "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
-            " DDPMScheduler.from_pretrained(<model_id>, predict_epsilon=True)`."
+            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
+            " DDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
         )
         predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
 
         if predict_epsilon is not None:
             new_config = dict(self.scheduler.config)
-            new_config["predict_epsilon"] = predict_epsilon
+            new_config["prediction_type"] = "epsilon" if predict_epsilon else "sample"
             self.scheduler._internal_dict = FrozenDict(new_config)
 
         if generator is not None and generator.device.type != self.device.type and self.device.type != "mps":
@@ -114,9 +114,7 @@ def __call__(
             model_output = self.unet(image, t).sample
 
             # 2. compute previous image: x_t -> x_t-1
-            image = self.scheduler.step(
-                model_output, t, image, generator=generator, predict_epsilon=predict_epsilon
-            ).prev_sample
+            image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 3e5ebfe0e8cd..b16716f0e676 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -23,7 +23,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput
+from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, BaseOutput, deprecate
 from .scheduling_utils import SchedulerMixin
 
 
@@ -106,6 +106,9 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
             an offset added to the inference steps. You can use a combination of `offset=1` and
             `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
             stable diffusion.
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
+            `v-prediction` is not supported for this scheduler.
 
     """
 
@@ -123,7 +126,16 @@ def __init__(
         set_alpha_to_one: bool = True,
         steps_offset: int = 0,
         prediction_type: str = "epsilon",
+        **kwargs,
     ):
+        message = (
+            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
+            " DDIMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+        )
+        predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+        if predict_epsilon is not None:
+            self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
+
         if trained_betas is not None:
             self.betas = torch.from_numpy(trained_betas)
         elif beta_schedule == "linear":
@@ -139,8 +151,6 @@ def __init__(
         else:
             raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
 
-        self.prediction_type = prediction_type
-
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
 
@@ -261,17 +271,17 @@ def step(
 
         # 3. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
-        if self.prediction_type == "epsilon":
+        if self.config.prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
-        elif self.prediction_type == "sample":
+        elif self.config.prediction_type == "sample":
             pred_original_sample = model_output
-        elif self.prediction_type == "v_prediction":
+        elif self.config.prediction_type == "v_prediction":
             pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
             # predict V
             model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
         else:
             raise ValueError(
-                f"prediction_type given as {self.prediction_type} must be one of `epsilon`, `sample`, or"
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
                 " `v_prediction`"
             )
 
diff --git a/src/diffusers/schedulers/scheduling_ddim_flax.py b/src/diffusers/schedulers/scheduling_ddim_flax.py
index ceef96a4a95f..122c36f291da 100644
--- a/src/diffusers/schedulers/scheduling_ddim_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddim_flax.py
@@ -23,6 +23,7 @@
 import jax.numpy as jnp
 
 from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import deprecate
 from .scheduling_utils_flax import (
     _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
     FlaxSchedulerMixin,
@@ -108,6 +109,10 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
             an offset added to the inference steps. You can use a combination of `offset=1` and
             `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
             stable diffusion.
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
+            `v-prediction` is not supported for this scheduler.
+
     """
 
     _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
@@ -125,7 +130,17 @@ def __init__(
         beta_schedule: str = "linear",
         set_alpha_to_one: bool = True,
         steps_offset: int = 0,
+        prediction_type: str = "epsilon",
+        **kwargs,
     ):
+        message = (
+            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
+            " FlaxDDIMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+        )
+        predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+        if predict_epsilon is not None:
+            self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
+
         if beta_schedule == "linear":
             self.betas = jnp.linspace(beta_start, beta_end, num_train_timesteps, dtype=jnp.float32)
         elif beta_schedule == "scaled_linear":
@@ -259,7 +274,19 @@ def step(
 
         # 3. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
-        pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        if self.config.prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        elif self.config.prediction_type == "sample":
+            pred_original_sample = model_output
+        elif self.config.prediction_type == "v_prediction":
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            # predict V
+            model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                " `v_prediction`"
+            )
 
         # 4. compute variance: "sigma_t(η)" -> see formula (16)
         # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 299a06f4eb13..c691630a2b86 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -99,9 +99,9 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
             `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
         clip_sample (`bool`, default `True`):
             option to clip predicted sample between -1 and 1 for numerical stability.
-        predict_epsilon (`bool`):
-            optional flag to use when the model predicts the noise (epsilon), or the samples instead of the noise.
-
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
+            `v-prediction` is not supported for this scheduler.
     """
 
     _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
@@ -116,8 +116,17 @@ def __init__(
         trained_betas: Optional[np.ndarray] = None,
         variance_type: str = "fixed_small",
         clip_sample: bool = True,
-        predict_epsilon: bool = True,
+        prediction_type: str = "epsilon",
+        **kwargs,
     ):
+        message = (
+            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
+            " DDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+        )
+        predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+        if predict_epsilon is not None:
+            self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
+
         if trained_betas is not None:
             self.betas = torch.from_numpy(trained_betas)
         elif beta_schedule == "linear":
@@ -241,13 +250,13 @@ def step(
 
         """
         message = (
-            "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
-            " DDPMScheduler.from_pretrained(<model_id>, predict_epsilon=True)`."
+            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
+            " DDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
         )
         predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
-        if predict_epsilon is not None and predict_epsilon != self.config.predict_epsilon:
+        if predict_epsilon is not None:
             new_config = dict(self.config)
-            new_config["predict_epsilon"] = predict_epsilon
+            new_config["prediction_type"] = "epsilon" if predict_epsilon else "sample"
             self._internal_dict = FrozenDict(new_config)
 
         t = timestep
@@ -265,10 +274,15 @@ def step(
 
         # 2. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-        if self.config.predict_epsilon:
+        if self.config.prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
-        else:
+        elif self.config.prediction_type == "sample":
             pred_original_sample = model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
+                " for the DDPMScheduler."
+            )
 
         # 3. Clip "predicted x_0"
         if self.config.clip_sample:
diff --git a/src/diffusers/schedulers/scheduling_ddpm_flax.py b/src/diffusers/schedulers/scheduling_ddpm_flax.py
index 480cbda73c65..946665a0215f 100644
--- a/src/diffusers/schedulers/scheduling_ddpm_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddpm_flax.py
@@ -103,9 +103,9 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
             `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
         clip_sample (`bool`, default `True`):
             option to clip predicted sample between -1 and 1 for numerical stability.
-        predict_epsilon (`bool`):
-            optional flag to use when the model predicts the noise (epsilon), or the samples instead of the noise.
-
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`.
+            `v-prediction` is not supported for this scheduler.
     """
 
     _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
@@ -124,8 +124,17 @@ def __init__(
         trained_betas: Optional[jnp.ndarray] = None,
         variance_type: str = "fixed_small",
         clip_sample: bool = True,
-        predict_epsilon: bool = True,
+        prediction_type: str = "epsilon",
+        **kwargs,
     ):
+        message = (
+            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
+            " FlaxDDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+        )
+        predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+        if predict_epsilon is not None:
+            self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
+
         if trained_betas is not None:
             self.betas = jnp.asarray(trained_betas)
         elif beta_schedule == "linear":
@@ -204,7 +213,6 @@ def step(
         timestep: int,
         sample: jnp.ndarray,
         key: random.KeyArray,
-        predict_epsilon: bool = True,
         return_dict: bool = True,
         **kwargs,
     ) -> Union[FlaxDDPMSchedulerOutput, Tuple]:
@@ -227,13 +235,13 @@ def step(
 
         """
         message = (
-            "Please make sure to instantiate your scheduler with `predict_epsilon` instead. E.g. `scheduler ="
-            " DDPMScheduler.from_pretrained(<model_id>, predict_epsilon=True)`."
+            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
+            " FlaxDDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
         )
         predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
-        if predict_epsilon is not None and predict_epsilon != self.config.predict_epsilon:
+        if predict_epsilon is not None:
             new_config = dict(self.config)
-            new_config["predict_epsilon"] = predict_epsilon
+            new_config["prediction_type"] = "epsilon" if predict_epsilon else "sample"
             self._internal_dict = FrozenDict(new_config)
 
         t = timestep
@@ -251,10 +259,15 @@ def step(
 
         # 2. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-        if self.config.predict_epsilon:
+        if self.config.prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
-        else:
+        elif self.config.prediction_type == "sample":
             pred_original_sample = model_output
+        else:
+            raise ValueError(
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
+                " for the FlaxDDPMScheduler."
+            )
 
         # 3. Clip "predicted x_0"
         if self.config.clip_sample:
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
index 472b24637dcf..2999ff7f6a56 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -21,7 +21,7 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS
+from ..utils import _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS, deprecate
 from .scheduling_utils import SchedulerMixin, SchedulerOutput
 
 
@@ -87,10 +87,9 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
         solver_order (`int`, default `2`):
             the order of DPM-Solver; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided
             sampling, and `solver_order=3` for unconditional sampling.
-        predict_epsilon (`bool`, default `True`):
-            we currently support both the noise prediction model and the data prediction model. If the model predicts
-            the noise / epsilon, set `predict_epsilon` to `True`. If the model predicts the data / x0 directly, set
-            `predict_epsilon` to `False`.
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the data / `x0`. One of `epsilon`, `sample`.
+            `v-prediction` is not supported for this scheduler.
         thresholding (`bool`, default `False`):
             whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
             For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to
@@ -128,14 +127,23 @@ def __init__(
         beta_schedule: str = "linear",
         trained_betas: Optional[np.ndarray] = None,
         solver_order: int = 2,
-        predict_epsilon: bool = True,
+        prediction_type: str = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
         sample_max_value: float = 1.0,
         algorithm_type: str = "dpmsolver++",
         solver_type: str = "midpoint",
         lower_order_final: bool = True,
+        **kwargs,
     ):
+        message = (
+            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
+            " DPMSolverMultistepScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+        )
+        predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+        if predict_epsilon is not None:
+            self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
+
         if trained_betas is not None:
             self.betas = torch.from_numpy(trained_betas)
         elif beta_schedule == "linear":
@@ -221,11 +229,17 @@ def convert_model_output(
         """
         # DPM-Solver++ needs to solve an integral of the data prediction model.
         if self.config.algorithm_type == "dpmsolver++":
-            if self.config.predict_epsilon:
+            if self.config.prediction_type == "epsilon":
                 alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                 x0_pred = (sample - sigma_t * model_output) / alpha_t
-            else:
+            elif self.config.prediction_type == "sample":
                 x0_pred = model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
+                    " for the DPMSolverMultistepScheduler."
+                )
+
             if self.config.thresholding:
                 # Dynamic thresholding in https://arxiv.org/abs/2205.11487
                 dynamic_max_val = torch.quantile(
@@ -239,12 +253,17 @@ def convert_model_output(
             return x0_pred
         # DPM-Solver needs to solve an integral of the noise prediction model.
         elif self.config.algorithm_type == "dpmsolver":
-            if self.config.predict_epsilon:
+            if self.config.prediction_type == "epsilon":
                 return model_output
-            else:
+            elif self.config.prediction_type == "sample":
                 alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                 epsilon = (sample - alpha_t * model_output) / sigma_t
                 return epsilon
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
+                    " for the DPMSolverMultistepScheduler."
+                )
 
     def dpm_solver_first_order_update(
         self,
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
index d6fa3835346b..8bb0672fb76a 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
@@ -23,6 +23,7 @@
 import jax.numpy as jnp
 
 from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import deprecate
 from .scheduling_utils_flax import (
     _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
     FlaxSchedulerMixin,
@@ -118,10 +119,9 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
         solver_order (`int`, default `2`):
             the order of DPM-Solver; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided
             sampling, and `solver_order=3` for unconditional sampling.
-        predict_epsilon (`bool`, default `True`):
-            we currently support both the noise prediction model and the data prediction model. If the model predicts
-            the noise / epsilon, set `predict_epsilon` to `True`. If the model predicts the data / x0 directly, set
-            `predict_epsilon` to `False`.
+        prediction_type (`str`, default `epsilon`):
+            indicates whether the model predicts the noise (epsilon), or the data / `x0`. One of `epsilon`, `sample`.
+            `v-prediction` is not supported for this scheduler.
         thresholding (`bool`, default `False`):
             whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
             For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to
@@ -163,14 +163,23 @@ def __init__(
         beta_schedule: str = "linear",
         trained_betas: Optional[jnp.ndarray] = None,
         solver_order: int = 2,
-        predict_epsilon: bool = True,
+        prediction_type: str = "epsilon",
         thresholding: bool = False,
         dynamic_thresholding_ratio: float = 0.995,
         sample_max_value: float = 1.0,
         algorithm_type: str = "dpmsolver++",
         solver_type: str = "midpoint",
         lower_order_final: bool = True,
+        **kwargs,
     ):
+        message = (
+            "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
+            " FlaxDPMSolverMultistepScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
+        )
+        predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
+        if predict_epsilon is not None:
+            self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
+
         if trained_betas is not None:
             self.betas = jnp.asarray(trained_betas)
         elif beta_schedule == "linear":
@@ -260,11 +269,17 @@ def convert_model_output(
         """
         # DPM-Solver++ needs to solve an integral of the data prediction model.
         if self.config.algorithm_type == "dpmsolver++":
-            if self.config.predict_epsilon:
+            if self.config.prediction_type == "epsilon":
                 alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                 x0_pred = (sample - sigma_t * model_output) / alpha_t
-            else:
+            elif self.config.prediction_type == "sample":
                 x0_pred = model_output
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
+                    " for the FlaxDPMSolverMultistepScheduler."
+                )
+
             if self.config.thresholding:
                 # Dynamic thresholding in https://arxiv.org/abs/2205.11487
                 dynamic_max_val = jnp.percentile(
@@ -277,12 +292,17 @@ def convert_model_output(
             return x0_pred
         # DPM-Solver needs to solve an integral of the noise prediction model.
         elif self.config.algorithm_type == "dpmsolver":
-            if self.config.predict_epsilon:
+            if self.config.prediction_type == "epsilon":
                 return model_output
-            else:
+            elif self.config.prediction_type == "sample":
                 alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                 epsilon = (sample - alpha_t * model_output) / sigma_t
                 return epsilon
+            else:
+                raise ValueError(
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
+                    " for the FlaxDPMSolverMultistepScheduler."
+                )
 
     def dpm_solver_first_order_update(
         self, model_output: jnp.ndarray, timestep: int, prev_timestep: int, sample: jnp.ndarray
diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py
index 332c428c66c6..4b7b2909e7d4 100644
--- a/src/diffusers/schedulers/scheduling_euler_discrete.py
+++ b/src/diffusers/schedulers/scheduling_euler_discrete.py
@@ -92,8 +92,6 @@ def __init__(
         else:
             raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
 
-        self.prediction_type = prediction_type
-
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
 
@@ -232,14 +230,14 @@ def step(
             sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
 
         # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
-        if self.prediction_type == "epsilon":
+        if self.config.prediction_type == "epsilon":
             pred_original_sample = sample - sigma_hat * model_output
-        elif self.prediction_type == "v_prediction":
+        elif self.config.prediction_type == "v_prediction":
             # * c_out + input * c_skip
             pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
         else:
             raise ValueError(
-                f"prediction_type given as {self.prediction_type} must be one of `epsilon`, or `v_prediction`"
+                f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
             )
 
         # 2. Convert to an ODE derivative
diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py
index ef293109bf7d..6656fb738d51 100644
--- a/tests/pipelines/ddpm/test_ddpm.py
+++ b/tests/pipelines/ddpm/test_ddpm.py
@@ -68,7 +68,7 @@ def test_inference(self):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
         assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
 
-    def test_inference_predict_epsilon(self):
+    def test_inference_deprecated_predict_epsilon(self):
         deprecate("remove this test", "0.10.0", "remove")
         unet = self.dummy_uncond_unet
         scheduler = DDPMScheduler(predict_epsilon=False)
@@ -98,6 +98,35 @@ def test_inference_predict_epsilon(self):
         tolerance = 1e-2 if torch_device != "mps" else 3e-2
         assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance
 
+    def test_inference_predict_sample(self):
+        unet = self.dummy_uncond_unet
+        scheduler = DDPMScheduler(prediction_type="sample")
+
+        ddpm = DDPMPipeline(unet=unet, scheduler=scheduler)
+        ddpm.to(torch_device)
+        ddpm.set_progress_bar_config(disable=None)
+
+        # Warmup pass when using mps (see #372)
+        if torch_device == "mps":
+            _ = ddpm(num_inference_steps=1)
+
+        if torch_device == "mps":
+            # device type MPS is not supported for torch.Generator() api.
+            generator = torch.manual_seed(0)
+        else:
+            generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images
+
+        generator = generator.manual_seed(0)
+        image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy")[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_eps_slice = image_eps[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 32, 32, 3)
+        tolerance = 1e-2 if torch_device != "mps" else 3e-2
+        assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance
+
 
 @slow
 @require_torch_gpu
diff --git a/tests/test_config.py b/tests/test_config.py
index 0875930e37cc..2a021c4ced5f 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -26,6 +26,7 @@
     logging,
 )
 from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.utils import deprecate
 from diffusers.utils.testing_utils import CaptureLogger
 
 
@@ -194,17 +195,27 @@ def test_overwrite_config_on_load(self):
             ddpm = DDPMScheduler.from_pretrained(
                 "hf-internal-testing/tiny-stable-diffusion-torch",
                 subfolder="scheduler",
-                predict_epsilon=False,
+                prediction_type="sample",
                 beta_end=8,
             )
 
         with CaptureLogger(logger) as cap_logger_2:
             ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88)
 
+        with CaptureLogger(logger) as cap_logger:
+            deprecate("remove this case", "0.10.0", "remove")
+            ddpm_3 = DDPMScheduler.from_pretrained(
+                "hf-internal-testing/tiny-stable-diffusion-torch",
+                subfolder="scheduler",
+                predict_epsilon=False,
+                beta_end=8,
+            )
+
         assert ddpm.__class__ == DDPMScheduler
-        assert ddpm.config.predict_epsilon is False
+        assert ddpm.config.prediction_type == "sample"
         assert ddpm.config.beta_end == 8
         assert ddpm_2.config.beta_start == 88
+        assert ddpm_3.config.prediction_type == "sample"
 
         # no warning should be thrown
         assert cap_logger.out == ""
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index a1bee2969687..0aad9de8be02 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -20,7 +20,6 @@
 import shutil
 import tempfile
 import unittest
-from functools import partial
 
 import numpy as np
 import torch
@@ -332,14 +331,13 @@ def to(self, device):
     @parameterized.expand(
         [
             [DDIMScheduler, DDIMPipeline, 32],
-            [partial(DDPMScheduler, predict_epsilon=True), DDPMPipeline, 32],
+            [DDPMScheduler, DDPMPipeline, 32],
             [DDIMScheduler, DDIMPipeline, (32, 64)],
-            [partial(DDPMScheduler, predict_epsilon=True), DDPMPipeline, (64, 32)],
+            [DDPMScheduler, DDPMPipeline, (64, 32)],
         ]
     )
     def test_uncond_unet_components(self, scheduler_fn=DDPMScheduler, pipeline_fn=DDPMPipeline, sample_size=32):
         unet = self.dummy_uncond_unet(sample_size)
-        # DDIM doesn't take `predict_epsilon`, and DDPM requires it -- so using partial in parameterized decorator
         scheduler = scheduler_fn()
         pipeline = pipeline_fn(unet, scheduler).to(torch_device)
 
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 9c9abd09732b..4406149d860c 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -599,7 +599,12 @@ def test_clip_sample(self):
         for clip_sample in [True, False]:
             self.check_over_configs(clip_sample=clip_sample)
 
-    def test_predict_epsilon(self):
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "sample"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_deprecated_predict_epsilon(self):
+        deprecate("remove this test", "0.10.0", "remove")
         for predict_epsilon in [True, False]:
             self.check_over_configs(predict_epsilon=predict_epsilon)
 
@@ -795,7 +800,7 @@ def get_scheduler_config(self, **kwargs):
             "beta_end": 0.02,
             "beta_schedule": "linear",
             "solver_order": 2,
-            "predict_epsilon": True,
+            "prediction_type": "epsilon",
             "thresholding": False,
             "sample_max_value": 1.0,
             "algorithm_type": "dpmsolver++",
@@ -921,10 +926,10 @@ def test_thresholding(self):
         for order in [1, 2, 3]:
             for solver_type in ["midpoint", "heun"]:
                 for threshold in [0.5, 1.0, 2.0]:
-                    for predict_epsilon in [True, False]:
+                    for prediction_type in ["epsilon", "sample"]:
                         self.check_over_configs(
                             thresholding=True,
-                            predict_epsilon=predict_epsilon,
+                            prediction_type=prediction_type,
                             sample_max_value=threshold,
                             algorithm_type="dpmsolver++",
                             solver_order=order,
@@ -935,17 +940,17 @@ def test_solver_order_and_type(self):
         for algorithm_type in ["dpmsolver", "dpmsolver++"]:
             for solver_type in ["midpoint", "heun"]:
                 for order in [1, 2, 3]:
-                    for predict_epsilon in [True, False]:
+                    for prediction_type in ["epsilon", "sample"]:
                         self.check_over_configs(
                             solver_order=order,
                             solver_type=solver_type,
-                            predict_epsilon=predict_epsilon,
+                            prediction_type=prediction_type,
                             algorithm_type=algorithm_type,
                         )
                         sample = self.full_loop(
                             solver_order=order,
                             solver_type=solver_type,
-                            predict_epsilon=predict_epsilon,
+                            prediction_type=prediction_type,
                             algorithm_type=algorithm_type,
                         )
                         assert not torch.isnan(sample).any(), "Samples have nan numbers"
diff --git a/tests/test_scheduler_flax.py b/tests/test_scheduler_flax.py
index 0fa0e1b495bb..6524e18d23a8 100644
--- a/tests/test_scheduler_flax.py
+++ b/tests/test_scheduler_flax.py
@@ -17,7 +17,7 @@
 from typing import Dict, List, Tuple
 
 from diffusers import FlaxDDIMScheduler, FlaxDDPMScheduler, FlaxPNDMScheduler
-from diffusers.utils import is_flax_available
+from diffusers.utils import deprecate, is_flax_available
 from diffusers.utils.testing_utils import require_flax
 
 
@@ -599,6 +599,26 @@ def test_full_loop_with_no_set_alpha_to_one(self):
             assert abs(result_sum - 149.0784) < 1e-2
             assert abs(result_mean - 0.1941) < 1e-3
 
+    def test_prediction_type(self):
+        for prediction_type in ["epsilon", "sample", "v_prediction"]:
+            self.check_over_configs(prediction_type=prediction_type)
+
+    def test_deprecated_predict_epsilon(self):
+        deprecate("remove this test", "0.10.0", "remove")
+        for predict_epsilon in [True, False]:
+            self.check_over_configs(predict_epsilon=predict_epsilon)
+
+    def test_deprecated_predict_epsilon_to_prediction_type(self):
+        deprecate("remove this test", "0.10.0", "remove")
+        for scheduler_class in self.scheduler_classes:
+            scheduler_config = self.get_scheduler_config(predict_epsilon=True)
+            scheduler = scheduler_class.from_config(scheduler_config)
+            assert scheduler.prediction_type == "epsilon"
+
+            scheduler_config = self.get_scheduler_config(predict_epsilon=False)
+            scheduler = scheduler_class.from_config(scheduler_config)
+            assert scheduler.prediction_type == "sample"
+
 
 @require_flax
 class FlaxPNDMSchedulerTest(FlaxSchedulerCommonTest):

From 86aa747da9c99134f6527e4562014cbdd7ebaa72 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Fri, 25 Nov 2022 14:51:17 +0100
Subject: [PATCH 259/269] Fix ONNX conversion and inference (#1416)

---
 ...ert_stable_diffusion_checkpoint_to_onnx.py |  5 ++-
 .../pipeline_onnx_stable_diffusion.py         | 40 +++--------------
 .../pipeline_onnx_stable_diffusion_img2img.py | 24 +----------
 .../pipeline_onnx_stable_diffusion_inpaint.py | 43 +++----------------
 4 files changed, 18 insertions(+), 94 deletions(-)

diff --git a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
index f0e0b178af20..26d3d5618f88 100644
--- a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
+++ b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py
@@ -215,8 +215,10 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
         )
         del pipeline.safety_checker
         safety_checker = OnnxRuntimeModel.from_pretrained(output_path / "safety_checker")
+        feature_extractor = pipeline.feature_extractor
     else:
         safety_checker = None
+        feature_extractor = None
 
     onnx_pipeline = OnnxStableDiffusionPipeline(
         vae_encoder=OnnxRuntimeModel.from_pretrained(output_path / "vae_encoder"),
@@ -226,7 +228,8 @@ def convert_models(model_path: str, output_path: str, opset: int, fp16: bool = F
         unet=OnnxRuntimeModel.from_pretrained(output_path / "unet"),
         scheduler=pipeline.scheduler,
         safety_checker=safety_checker,
-        feature_extractor=pipeline.feature_extractor,
+        feature_extractor=feature_extractor,
+        requires_safety_checker=safety_checker is not None,
     )
 
     onnx_pipeline.save_pretrained(output_path)
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index 3caab834befd..6cb2c8ba87c7 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -18,7 +18,6 @@
 import numpy as np
 import torch
 
-from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -42,6 +41,8 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
     safety_checker: OnnxRuntimeModel
     feature_extractor: CLIPFeatureExtractor
 
+    _optional_components = ["safety_checker", "feature_extractor"]
+
     def __init__(
         self,
         vae_encoder: OnnxRuntimeModel,
@@ -99,27 +100,6 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
-        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
-            deprecation_message = (
-                "The configuration file of the unet has set the default `sample_size` to smaller than"
-                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
-                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
-                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
-                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
-                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
-                " in the config might lead to incorrect results in future versions. If you have downloaded this"
-                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
-                " the `unet/config.json` file"
-            )
-            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(unet.config)
-            new_config["sample_size"] = 64
-            unet._internal_dict = FrozenDict(new_config)
-
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
@@ -130,7 +110,6 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
@@ -213,8 +192,8 @@ def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guida
     def __call__(
         self,
         prompt: Union[str, List[str]],
-        height: Optional[int] = None,
-        width: Optional[int] = None,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
         num_inference_steps: Optional[int] = 50,
         guidance_scale: Optional[float] = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -228,10 +207,6 @@ def __call__(
         callback_steps: Optional[int] = 1,
         **kwargs,
     ):
-        # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
-
         if isinstance(prompt, str):
             batch_size = 1
         elif isinstance(prompt, list):
@@ -264,12 +239,7 @@ def __call__(
 
         # get the initial random noise unless the user supplied it
         latents_dtype = text_embeddings.dtype
-        latents_shape = (
-            batch_size * num_images_per_prompt,
-            4,
-            height // self.vae_scale_factor,
-            width // self.vae_scale_factor,
-        )
+        latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)
         if latents is None:
             latents = generator.randn(*latents_shape).astype(latents_dtype)
         elif latents.shape != latents_shape:
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index 4d42201676c9..949ef94b3a5a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -19,7 +19,6 @@
 import torch
 
 import PIL
-from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -78,6 +77,8 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
     safety_checker: OnnxRuntimeModel
     feature_extractor: CLIPFeatureExtractor
 
+    _optional_components = ["safety_checker", "feature_extractor"]
+
     def __init__(
         self,
         vae_encoder: OnnxRuntimeModel,
@@ -135,27 +136,6 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
-        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
-            deprecation_message = (
-                "The configuration file of the unet has set the default `sample_size` to smaller than"
-                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
-                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
-                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
-                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
-                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
-                " in the config might lead to incorrect results in future versions. If you have downloaded this"
-                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
-                " the `unet/config.json` file"
-            )
-            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(unet.config)
-            new_config["sample_size"] = 64
-            unet._internal_dict = FrozenDict(new_config)
-
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 863f7b7aaea8..0a8f7a5fc580 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -19,7 +19,6 @@
 import torch
 
 import PIL
-from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
@@ -91,6 +90,8 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
     safety_checker: OnnxRuntimeModel
     feature_extractor: CLIPFeatureExtractor
 
+    _optional_components = ["safety_checker", "feature_extractor"]
+
     def __init__(
         self,
         vae_encoder: OnnxRuntimeModel,
@@ -149,27 +150,6 @@ def __init__(
                 " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
             )
 
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
-        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
-            deprecation_message = (
-                "The configuration file of the unet has set the default `sample_size` to smaller than"
-                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
-                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
-                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
-                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
-                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
-                " in the config might lead to incorrect results in future versions. If you have downloaded this"
-                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
-                " the `unet/config.json` file"
-            )
-            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(unet.config)
-            new_config["sample_size"] = 64
-            unet._internal_dict = FrozenDict(new_config)
-
         self.register_modules(
             vae_encoder=vae_encoder,
             vae_decoder=vae_decoder,
@@ -180,7 +160,6 @@ def __init__(
             safety_checker=safety_checker,
             feature_extractor=feature_extractor,
         )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         self.register_to_config(requires_safety_checker=requires_safety_checker)
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt
@@ -267,8 +246,8 @@ def __call__(
         prompt: Union[str, List[str]],
         image: PIL.Image.Image,
         mask_image: PIL.Image.Image,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
+        height: Optional[int] = 512,
+        width: Optional[int] = 512,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.5,
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -296,9 +275,9 @@ def __call__(
                 repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
                 to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
                 instead of 3, so the expected shape would be `(B, H, W, 1)`.
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+            height (`int`, *optional*, defaults to 512):
                 The height in pixels of the generated image.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+            width (`int`, *optional*, defaults to 512):
                 The width in pixels of the generated image.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
@@ -343,9 +322,6 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
-        # 0. Default height and width to unet
-        height = height or self.unet.config.sample_size * self.vae_scale_factor
-        width = width or self.unet.config.sample_size * self.vae_scale_factor
 
         if isinstance(prompt, str):
             batch_size = 1
@@ -381,12 +357,7 @@ def __call__(
         )
 
         num_channels_latents = NUM_LATENT_CHANNELS
-        latents_shape = (
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height // self.vae_scale_factor,
-            width // self.vae_scale_factor,
-        )
+        latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8)
         latents_dtype = text_embeddings.dtype
         if latents is None:
             latents = generator.randn(*latents_shape).astype(latents_dtype)

From 8faa822ddc6e214498fc1a6d6e7a48ed31d2fb91 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 25 Nov 2022 15:07:09 +0100
Subject: [PATCH 260/269] Allow to set config params directly in init (#1419)

* fix

* fix deprecated kwargs logic

* add tests

* finish
---
 src/diffusers/configuration_utils.py          | 20 ++++++++++-------
 src/diffusers/models/unet_2d_blocks.py        |  2 --
 .../versatile_diffusion/modeling_text_unet.py |  1 -
 src/diffusers/schedulers/scheduling_ddim.py   |  1 +
 .../schedulers/scheduling_ddim_flax.py        |  1 +
 src/diffusers/schedulers/scheduling_ddpm.py   |  1 +
 .../schedulers/scheduling_ddpm_flax.py        |  1 +
 .../scheduling_dpmsolver_multistep.py         |  1 +
 .../scheduling_dpmsolver_multistep_flax.py    |  1 +
 tests/test_modeling_common.py                 | 20 +++++++++++++++++
 tests/test_modeling_common_flax.py            | 22 +++++++++++++++++++
 tests/test_scheduler.py                       | 21 ++++++++++++++++++
 tests/test_scheduler_flax.py                  | 22 +++++++++++++++++++
 13 files changed, 103 insertions(+), 11 deletions(-)

diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py
index 1a7499c6118d..f06586b23698 100644
--- a/src/diffusers/configuration_utils.py
+++ b/src/diffusers/configuration_utils.py
@@ -80,14 +80,18 @@ class ConfigMixin:
         - **config_name** (`str`) -- A filename under which the config should stored when calling
           [`~ConfigMixin.save_config`] (should be overridden by parent class).
         - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be
-          overridden by parent class).
-        - **has_compatibles** (`bool`) -- Whether the class has compatible classes (should be overridden by parent
-          class).
+          overridden by subclass).
+        - **has_compatibles** (`bool`) -- Whether the class has compatible classes (should be overridden by subclass).
+        - **_deprecated_kwargs** (`List[str]`) -- Keyword arguments that are deprecated. Note that the init function
+          should only have a `kwargs` argument if at least one argument is deprecated (should be overridden by
+          subclass).
     """
     config_name = None
     ignore_for_config = []
     has_compatibles = False
 
+    _deprecated_kwargs = []
+
     def register_to_config(self, **kwargs):
         if self.config_name is None:
             raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`")
@@ -195,10 +199,10 @@ def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_un
         if "dtype" in unused_kwargs:
             init_dict["dtype"] = unused_kwargs.pop("dtype")
 
-        if "predict_epsilon" in unused_kwargs and "prediction_type" not in init_dict:
-            deprecate("remove this", "0.10.0", "remove")
-            predict_epsilon = unused_kwargs.pop("predict_epsilon")
-            init_dict["prediction_type"] = "epsilon" if predict_epsilon else "sample"
+        # add possible deprecated kwargs
+        for deprecated_kwarg in cls._deprecated_kwargs:
+            if deprecated_kwarg in unused_kwargs:
+                init_dict[deprecated_kwarg] = unused_kwargs.pop(deprecated_kwarg)
 
         # Return model and optionally state and/or unused_kwargs
         model = cls(**init_dict)
@@ -526,7 +530,6 @@ def inner_init(self, *args, **kwargs):
         # Ignore private kwargs in the init.
         init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
         config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")}
-        init(self, *args, **init_kwargs)
         if not isinstance(self, ConfigMixin):
             raise RuntimeError(
                 f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does "
@@ -553,6 +556,7 @@ def inner_init(self, *args, **kwargs):
         )
         new_kwargs = {**config_init_kwargs, **new_kwargs}
         getattr(self, "register_to_config")(**new_kwargs)
+        init(self, *args, **init_kwargs)
 
     return inner_init
 
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
index 6b4a88c0ae3d..cce7e7fd5a90 100644
--- a/src/diffusers/models/unet_2d_blocks.py
+++ b/src/diffusers/models/unet_2d_blocks.py
@@ -254,7 +254,6 @@ def __init__(
         attn_num_head_channels=1,
         attention_type="default",
         output_scale_factor=1.0,
-        **kwargs,
     ):
         super().__init__()
 
@@ -336,7 +335,6 @@ def __init__(
         cross_attention_dim=1280,
         dual_cross_attention=False,
         use_linear_projection=False,
-        **kwargs,
     ):
         super().__init__()
 
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index fb8855b95f8c..37a79b5c1be7 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -1039,7 +1039,6 @@ def __init__(
         cross_attention_dim=1280,
         dual_cross_attention=False,
         use_linear_projection=False,
-        **kwargs,
     ):
         super().__init__()
 
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index b16716f0e676..3640b37546ec 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -113,6 +113,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
     """
 
     _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+    _deprecated_kwargs = ["predict_epsilon"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/schedulers/scheduling_ddim_flax.py b/src/diffusers/schedulers/scheduling_ddim_flax.py
index 122c36f291da..f98d9770043f 100644
--- a/src/diffusers/schedulers/scheduling_ddim_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddim_flax.py
@@ -116,6 +116,7 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
     """
 
     _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+    _deprecated_kwargs = ["predict_epsilon"]
 
     @property
     def has_state(self):
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index c691630a2b86..6f131659c2ba 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -105,6 +105,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
     """
 
     _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+    _deprecated_kwargs = ["predict_epsilon"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/schedulers/scheduling_ddpm_flax.py b/src/diffusers/schedulers/scheduling_ddpm_flax.py
index 946665a0215f..97b38fd3a17e 100644
--- a/src/diffusers/schedulers/scheduling_ddpm_flax.py
+++ b/src/diffusers/schedulers/scheduling_ddpm_flax.py
@@ -109,6 +109,7 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
     """
 
     _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+    _deprecated_kwargs = ["predict_epsilon"]
 
     @property
     def has_state(self):
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
index 2999ff7f6a56..d38ceed281ed 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -117,6 +117,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
     """
 
     _compatibles = _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+    _deprecated_kwargs = ["predict_epsilon"]
 
     @register_to_config
     def __init__(
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
index 8bb0672fb76a..4d56d99a8cd8 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
@@ -149,6 +149,7 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
     """
 
     _compatibles = _FLAX_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS.copy()
+    _deprecated_kwargs = ["predict_epsilon"]
 
     @property
     def has_state(self):
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 49bb4f6deb33..cad1887f4df8 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -265,3 +265,23 @@ def test_enable_disable_gradient_checkpointing(self):
         # check disable works
         model.disable_gradient_checkpointing()
         self.assertFalse(model.is_gradient_checkpointing)
+
+    def test_deprecated_kwargs(self):
+        has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters
+        has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0
+
+        if has_kwarg_in_model_class and not has_deprecated_kwarg:
+            raise ValueError(
+                f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs"
+                " under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are"
+                " no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
+                " [<deprecated_argument>]`"
+            )
+
+        if not has_kwarg_in_model_class and has_deprecated_kwarg:
+            raise ValueError(
+                f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs"
+                " under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs` argument to"
+                f" {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument"
+                " from `_deprecated_kwargs = [<deprecated_argument>]`"
+            )
diff --git a/tests/test_modeling_common_flax.py b/tests/test_modeling_common_flax.py
index 61849b22318f..8945aed7c93f 100644
--- a/tests/test_modeling_common_flax.py
+++ b/tests/test_modeling_common_flax.py
@@ -1,3 +1,5 @@
+import inspect
+
 from diffusers.utils import is_flax_available
 from diffusers.utils.testing_utils import require_flax
 
@@ -42,3 +44,23 @@ def test_forward_with_norm_groups(self):
         self.assertIsNotNone(output)
         expected_shape = inputs_dict["sample"].shape
         self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
+
+    def test_deprecated_kwargs(self):
+        has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters
+        has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0
+
+        if has_kwarg_in_model_class and not has_deprecated_kwarg:
+            raise ValueError(
+                f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs"
+                " under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are"
+                " no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
+                " [<deprecated_argument>]`"
+            )
+
+        if not has_kwarg_in_model_class and has_deprecated_kwarg:
+            raise ValueError(
+                f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs"
+                " under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs` argument to"
+                f" {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument"
+                " from `_deprecated_kwargs = [<deprecated_argument>]`"
+            )
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 4406149d860c..6a76581632ad 100755
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -562,6 +562,27 @@ def test_add_noise_device(self):
             noised = scheduler.add_noise(scaled_sample, noise, t)
             self.assertEqual(noised.shape, scaled_sample.shape)
 
+    def test_deprecated_kwargs(self):
+        for scheduler_class in self.scheduler_classes:
+            has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters
+            has_deprecated_kwarg = len(scheduler_class._deprecated_kwargs) > 0
+
+            if has_kwarg_in_model_class and not has_deprecated_kwarg:
+                raise ValueError(
+                    f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated"
+                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if"
+                    " there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
+                    " [<deprecated_argument>]`"
+                )
+
+            if not has_kwarg_in_model_class and has_deprecated_kwarg:
+                raise ValueError(
+                    f"{scheduler_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated"
+                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs`"
+                    f" argument to {self.model_class}.__init__ if there are deprecated arguments or remove the"
+                    " deprecated argument from `_deprecated_kwargs = [<deprecated_argument>]`"
+                )
+
 
 class DDPMSchedulerTest(SchedulerCommonTest):
     scheduler_classes = (DDPMScheduler,)
diff --git a/tests/test_scheduler_flax.py b/tests/test_scheduler_flax.py
index 6524e18d23a8..5ada689b724d 100644
--- a/tests/test_scheduler_flax.py
+++ b/tests/test_scheduler_flax.py
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import inspect
 import tempfile
 import unittest
 from typing import Dict, List, Tuple
@@ -228,6 +229,27 @@ def recursive_check(tuple_object, dict_object):
 
             recursive_check(outputs_tuple[0], outputs_dict.prev_sample)
 
+    def test_deprecated_kwargs(self):
+        for scheduler_class in self.scheduler_classes:
+            has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters
+            has_deprecated_kwarg = len(scheduler_class._deprecated_kwargs) > 0
+
+            if has_kwarg_in_model_class and not has_deprecated_kwarg:
+                raise ValueError(
+                    f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated"
+                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if"
+                    " there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs ="
+                    " [<deprecated_argument>]`"
+                )
+
+            if not has_kwarg_in_model_class and has_deprecated_kwarg:
+                raise ValueError(
+                    f"{scheduler_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated"
+                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs`"
+                    f" argument to {self.model_class}.__init__ if there are deprecated arguments or remove the"
+                    " deprecated argument from `_deprecated_kwargs = [<deprecated_argument>]`"
+                )
+
 
 @require_flax
 class FlaxDDPMSchedulerTest(FlaxSchedulerCommonTest):

From 02aa4ef12e2ce0848a8bf5e36be667782f158a05 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Fri, 25 Nov 2022 15:14:13 +0100
Subject: [PATCH 261/269] Add tests for Stable Diffusion 2 V-prediction 768x768
 (#1420)

---
 .../test_stable_diffusion.py                  |  49 +-
 .../test_stable_diffusion_v_pred.py           | 474 ++++++++++++++++++
 2 files changed, 495 insertions(+), 28 deletions(-)
 create mode 100644 tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py

diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index e1d22662cd83..dcd4f6711dff 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -34,7 +34,7 @@
 )
 from diffusers.utils import load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu
-from transformers import CLIPFeatureExtractor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 
 from ...test_pipelines_common import PipelineTesterMixin
 
@@ -100,21 +100,6 @@ def dummy_text_encoder(self):
         )
         return CLIPTextModel(config)
 
-    @property
-    def dummy_extractor(self):
-        def extract(*args, **kwargs):
-            class Out:
-                def __init__(self):
-                    self.pixel_values = torch.ones([0])
-
-                def to(self, device):
-                    self.pixel_values.to(device)
-                    return self
-
-            return Out()
-
-        return extract
-
     def test_save_pretrained_from_pretrained(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
         unet = self.dummy_cond_unet
@@ -129,7 +114,6 @@ def test_save_pretrained_from_pretrained(self):
         vae = self.dummy_vae
         bert = self.dummy_text_encoder
         tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        feature_extractor = CLIPFeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-clip")
 
         # make sure here that pndm scheduler skips prk
         sd_pipe = StableDiffusionPipeline(
@@ -139,7 +123,8 @@ def test_save_pretrained_from_pretrained(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=feature_extractor,
+            feature_extractor=None,
+            requires_safety_checker=False,
         )
         sd_pipe = sd_pipe.to(device)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -185,7 +170,8 @@ def test_stable_diffusion_ddim(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+            feature_extractor=None,
+            requires_safety_checker=False,
         )
         sd_pipe = sd_pipe.to(device)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -231,7 +217,8 @@ def test_stable_diffusion_pndm(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+            feature_extractor=None,
+            requires_safety_checker=False,
         )
         sd_pipe = sd_pipe.to(device)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -276,7 +263,8 @@ def test_stable_diffusion_k_lms(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+            feature_extractor=None,
+            requires_safety_checker=False,
         )
         sd_pipe = sd_pipe.to(device)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -321,7 +309,8 @@ def test_stable_diffusion_k_euler_ancestral(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+            feature_extractor=None,
+            requires_safety_checker=False,
         )
         sd_pipe = sd_pipe.to(device)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -366,7 +355,8 @@ def test_stable_diffusion_k_euler(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+            feature_extractor=None,
+            requires_safety_checker=False,
         )
         sd_pipe = sd_pipe.to(device)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -411,7 +401,8 @@ def test_stable_diffusion_attention_chunk(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+            feature_extractor=None,
+            requires_safety_checker=False,
         )
         sd_pipe = sd_pipe.to(device)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -449,7 +440,8 @@ def test_stable_diffusion_fp16(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+            feature_extractor=None,
+            requires_safety_checker=False,
         )
         sd_pipe = sd_pipe.to(torch_device)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -475,7 +467,8 @@ def test_stable_diffusion_long_prompt(self):
             text_encoder=bert,
             tokenizer=tokenizer,
             safety_checker=None,
-            feature_extractor=self.dummy_extractor,
+            feature_extractor=None,
+            requires_safety_checker=False,
         )
         sd_pipe = sd_pipe.to(torch_device)
         sd_pipe.set_progress_bar_config(disable=None)
@@ -572,7 +565,7 @@ def test_stable_diffusion_k_lms(self):
         expected_slice = np.array([0.0548, 0.0626, 0.0612, 0.0611, 0.0706, 0.0586, 0.0843, 0.0333, 0.1197])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
-    def test_stable_diffusion_memory_chunking(self):
+    def test_stable_diffusion_attention_slicing(self):
         torch.cuda.reset_peak_memory_stats()
         model_id = "stabilityai/stable-diffusion-2-base"
         pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
@@ -651,7 +644,7 @@ def test_stable_diffusion_text2img_pipeline_default(self):
         prompt = "astronaut riding a horse"
 
         generator = torch.Generator(device=torch_device).manual_seed(0)
-        output = pipe(prompt=prompt, strength=0.75, guidance_scale=7.5, generator=generator, output_type="np")
+        output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np")
         image = output.images[0]
 
         assert image.shape == (512, 512, 3)
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
new file mode 100644
index 000000000000..cfc450db4a86
--- /dev/null
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
@@ -0,0 +1,474 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import time
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerDiscreteScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.utils import load_numpy, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class StableDiffusion2VPredictionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_cond_unet(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=4,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            # SD2-specific config below
+            attention_head_dim=(2, 4, 8, 8),
+            use_linear_projection=True,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+            sample_size=128,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=64,
+        )
+        return CLIPTextModel(config)
+
+    def test_stable_diffusion_v_pred_ddim(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+            prediction_type="v_prediction",
+        )
+
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+            requires_safety_checker=False,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.6424, 0.6109, 0.494, 0.5088, 0.4984, 0.4525, 0.5059, 0.5068, 0.4474])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_v_pred_k_euler(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet
+        scheduler = EulerDiscreteScheduler(
+            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", prediction_type="v_prediction"
+        )
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+            requires_safety_checker=False,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            generator=generator,
+            guidance_scale=6.0,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4616, 0.5184, 0.4887, 0.5111, 0.4839, 0.48, 0.5119, 0.5263, 0.4776])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_v_pred_fp16(self):
+        """Test that stable diffusion v-prediction works with fp16"""
+        unet = self.dummy_cond_unet
+        scheduler = DDIMScheduler(
+            beta_start=0.00085,
+            beta_end=0.012,
+            beta_schedule="scaled_linear",
+            clip_sample=False,
+            set_alpha_to_one=False,
+            prediction_type="v_prediction",
+        )
+        vae = self.dummy_vae
+        bert = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        # put models in fp16
+        unet = unet.half()
+        vae = vae.half()
+        bert = bert.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionPipeline(
+            unet=unet,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=bert,
+            tokenizer=tokenizer,
+            safety_checker=None,
+            feature_extractor=None,
+            requires_safety_checker=False,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
+
+        assert image.shape == (1, 64, 64, 3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_v_pred_default(self):
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.enable_attention_slicing()
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np")
+
+        image = output.images
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 768, 768, 3)
+        expected_slice = np.array([0.0567, 0.057, 0.0416, 0.0463, 0.0433, 0.06, 0.0517, 0.0526, 0.0866])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_v_pred_euler(self):
+        scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler")
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.enable_attention_slicing()
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+
+        output = sd_pipe([prompt], generator=generator, num_inference_steps=5, output_type="numpy")
+        image = output.images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 768, 768, 3)
+        expected_slice = np.array([0.0351, 0.0376, 0.0505, 0.0424, 0.0551, 0.0656, 0.0471, 0.0276, 0.0596])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_v_pred_dpm(self):
+        """
+        TODO: update this test after making DPM compatible with V-prediction!
+        """
+        scheduler = DPMSolverMultistepScheduler.from_pretrained(
+            "stabilityai/stable-diffusion-2", subfolder="scheduler"
+        )
+        sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler)
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.enable_attention_slicing()
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photograph of an astronaut riding a horse"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe(
+            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=5, output_type="numpy"
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+        assert image.shape == (1, 768, 768, 3)
+        expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+    def test_stable_diffusion_attention_slicing_v_pred(self):
+        torch.cuda.reset_peak_memory_stats()
+        model_id = "stabilityai/stable-diffusion-2"
+        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "a photograph of an astronaut riding a horse"
+
+        # make attention efficient
+        pipe.enable_attention_slicing()
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            output_chunked = pipe(
+                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+            )
+            image_chunked = output_chunked.images
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+        # make sure that less than 5.5 GB is allocated
+        assert mem_bytes < 5.5 * 10**9
+
+        # disable slicing
+        pipe.disable_attention_slicing()
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            output = pipe(
+                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
+            )
+            image = output.images
+
+        # make sure that more than 5.5 GB is allocated
+        mem_bytes = torch.cuda.max_memory_allocated()
+        assert mem_bytes > 5.5 * 10**9
+        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-3
+
+    def test_stable_diffusion_text2img_pipeline_v_pred_default(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
+            "sd2-text2img/astronaut_riding_a_horse_v_pred.npy"
+        )
+
+        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
+        pipe.to(torch_device)
+        pipe.enable_attention_slicing()
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "astronaut riding a horse"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+        assert np.abs(expected_image - image).max() < 5e-3
+
+    def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self):
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/"
+            "sd2-text2img/astronaut_riding_a_horse_v_pred_fp16.npy"
+        )
+
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2", revision="fp16", torch_dtype=torch.float16
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "astronaut riding a horse"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np")
+        image = output.images[0]
+
+        assert image.shape == (768, 768, 3)
+        assert np.abs(expected_image - image).max() < 5e-3
+
+    def test_stable_diffusion_text2img_intermediate_state_v_pred(self):
+        number_of_steps = 0
+
+        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
+            test_callback_fn.has_been_called = True
+            nonlocal number_of_steps
+            number_of_steps += 1
+            if step == 0:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 96, 96)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [-0.2543, -1.2755, 0.4261, -0.9555, -1.173, -0.5892, 2.4159, 0.1554, -1.2098]
+                )
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-3
+            elif step == 19:
+                latents = latents.detach().cpu().numpy()
+                assert latents.shape == (1, 4, 96, 96)
+                latents_slice = latents[0, -3:, -3:, -1]
+                expected_slice = np.array(
+                    [-0.9572, -0.967, -0.6152, 0.0894, -0.699, -0.2344, 1.5465, -0.0357, -0.1141]
+                )
+                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2
+
+        test_callback_fn.has_been_called = False
+
+        pipe = StableDiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-2", revision="fp16", torch_dtype=torch.float16
+        )
+        pipe = pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "Andromeda galaxy in a bottle"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        with torch.autocast(torch_device):
+            pipe(
+                prompt=prompt,
+                num_inference_steps=20,
+                guidance_scale=7.5,
+                generator=generator,
+                callback=test_callback_fn,
+                callback_steps=1,
+            )
+        assert test_callback_fn.has_been_called
+        assert number_of_steps == 20
+
+    def test_stable_diffusion_low_cpu_mem_usage_v_pred(self):
+        pipeline_id = "stabilityai/stable-diffusion-2"
+
+        start_time = time.time()
+        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(
+            pipeline_id, revision="fp16", torch_dtype=torch.float16
+        )
+        pipeline_low_cpu_mem_usage.to(torch_device)
+        low_cpu_mem_usage_time = time.time() - start_time
+
+        start_time = time.time()
+        _ = StableDiffusionPipeline.from_pretrained(
+            pipeline_id, revision="fp16", torch_dtype=torch.float16, low_cpu_mem_usage=False
+        )
+        normal_load_time = time.time() - start_time
+
+        assert 2 * low_cpu_mem_usage_time < normal_load_time
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        pipeline_id = "stabilityai/stable-diffusion-2"
+        prompt = "Andromeda galaxy in a bottle"
+
+        pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, revision="fp16", torch_dtype=torch.float16)
+        pipeline = pipeline.to(torch_device)
+        pipeline.enable_attention_slicing(1)
+        pipeline.enable_sequential_cpu_offload()
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        _ = pipeline(prompt, generator=generator, num_inference_steps=5)
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.8 GB is allocated
+        assert mem_bytes < 2.8 * 10**9

From 9ec5084a9c4ad5a72f9fa351ee33ffcb9b2a0094 Mon Sep 17 00:00:00 2001
From: Suraj Patil <surajp815@gmail.com>
Date: Fri, 25 Nov 2022 16:13:16 +0100
Subject: [PATCH 262/269] StableDiffusionUpscalePipeline (#1396)

* StableDiffusionUpscalePipeline

* fix a few things

* make it better

* fix image batching

* run vae in fp32

* fix docstr

* resize to mul of 64

* doc

* remove safety_checker

* add max_noise_level

* fix Copied

* begin tests

* slow tests

* default max_noise_level

* remove kwargs

* doc

* fix

* fix fast tests

* fix fast tests

* no sf

* don't offload vae

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
---
 .../source/api/pipelines/stable_diffusion.mdx |   7 +
 src/diffusers/__init__.py                     |   1 +
 src/diffusers/pipeline_utils.py               |   8 +-
 src/diffusers/pipelines/__init__.py           |   1 +
 .../pipelines/stable_diffusion/__init__.py    |   1 +
 .../pipeline_stable_diffusion_upscale.py      | 551 ++++++++++++++++++
 .../dummy_torch_and_transformers_objects.py   |  15 +
 .../test_stable_diffusion_upscale.py          | 315 ++++++++++
 8 files changed, 896 insertions(+), 3 deletions(-)
 create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
 create mode 100644 tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py

diff --git a/docs/source/api/pipelines/stable_diffusion.mdx b/docs/source/api/pipelines/stable_diffusion.mdx
index 9884cbb20772..cd50c3d5c328 100644
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -95,3 +95,10 @@ If you want to use all possible use cases in a single `DiffusionPipeline` you ca
 	- __call__
 	- enable_attention_slicing
 	- disable_attention_slicing
+
+
+## StableDiffusionUpscalePipeline
+[[autodoc]] StableDiffusionUpscalePipeline
+	- __call__
+	- enable_attention_slicing
+	- disable_attention_slicing
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 4a6661b6b393..912ae232a736 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -75,6 +75,7 @@
         StableDiffusionInpaintPipelineLegacy,
         StableDiffusionPipeline,
         StableDiffusionPipelineSafe,
+        StableDiffusionUpscalePipeline,
         VersatileDiffusionDualGuidedPipeline,
         VersatileDiffusionImageVariationPipeline,
         VersatileDiffusionPipeline,
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index d2c5516220bc..35ebd536c511 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -554,7 +554,9 @@ def load_module(name, value):
         init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
 
         if len(unused_kwargs) > 0:
-            logger.warning(f"Keyword arguments {unused_kwargs} not recognized.")
+            logger.warning(
+                f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored."
+            )
 
         # import it here to avoid circular import
         from diffusers import pipelines
@@ -680,8 +682,8 @@ def load_module(name, value):
     @staticmethod
     def _get_signature_keys(obj):
         parameters = inspect.signature(obj.__init__).parameters
-        required_parameters = {k: v for k, v in parameters.items() if v.default is not True}
-        optional_parameters = set({k for k, v in parameters.items() if v.default is True})
+        required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
         expected_modules = set(required_parameters.keys()) - set(["self"])
         return expected_modules, optional_parameters
 
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 9f4cef4b73e6..c5aba302042b 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -24,6 +24,7 @@
         StableDiffusionInpaintPipeline,
         StableDiffusionInpaintPipelineLegacy,
         StableDiffusionPipeline,
+        StableDiffusionUpscalePipeline,
     )
     from .stable_diffusion_safe import StableDiffusionPipelineSafe
     from .versatile_diffusion import (
diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py
index 3c012dbab89d..0136ab565bcb 100644
--- a/src/diffusers/pipelines/stable_diffusion/__init__.py
+++ b/src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -40,6 +40,7 @@ class StableDiffusionPipelineOutput(BaseOutput):
     from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
     from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
     from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
+    from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline
     from .safety_checker import StableDiffusionSafetyChecker
 
 if is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0.dev0"):
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
new file mode 100644
index 000000000000..7ccb43d46c14
--- /dev/null
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -0,0 +1,551 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+import torch
+
+import PIL
+from diffusers.utils import is_accelerate_available
+from transformers import CLIPTextModel, CLIPTokenizer
+
+from ...models import AutoencoderKL, UNet2DConditionModel
+from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import DDIMScheduler, DDPMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def preprocess(image):
+    # resize to multiple of 64
+    width, height = image.size
+    width = width - width % 64
+    height = height - height % 64
+    image = image.resize((width, height))
+
+    image = np.array(image.convert("RGB"))
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+    return image
+
+
+class StableDiffusionUpscalePipeline(DiffusionPipeline):
+    r"""
+    Pipeline for text-guided image super-resolution using Stable Diffusion 2.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        low_res_scheduler ([`SchedulerMixin`]):
+            A scheduler used to add initial noise to the low res conditioning image. It must be an instance of
+            [`DDPMScheduler`].
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+    """
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        low_res_scheduler: DDPMScheduler,
+        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
+        max_noise_level: int = 350,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+        )
+        self.register_to_config(max_noise_level=max_noise_level)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_attention_slicing
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+
+        self.unet.set_attention_slice(slice_size)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_attention_slicing
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
+        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
+        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        for cpu_offloaded_model in [self.unet, self.text_encoder]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_xformers_memory_efficient_attention
+    def enable_xformers_memory_efficient_attention(self):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_xformers_memory_efficient_attention
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.unet.set_use_memory_efficient_attention_xformers(False)
+
+    @property
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+    def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list(int)`):
+                prompt to be encoded
+            device: (`torch.device`):
+                torch device
+            num_images_per_prompt (`int`):
+                number of images that should be generated per prompt
+            do_classifier_free_guidance (`bool`):
+                whether to use classifier free guidance or not
+            negative_prompt (`str` or `List[str]`):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids
+
+        if not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
+
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+        return text_embeddings
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents with 0.18215->0.08333
+    def decode_latents(self, latents):
+        latents = 1 / 0.08333 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        return image
+
+    def check_inputs(self, prompt, image, noise_level, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}"
+            )
+
+        # verify batch size of prompt and image are same if image is a list or tensor
+        if isinstance(image, list) or isinstance(image, torch.Tensor):
+            if isinstance(prompt, str):
+                batch_size = 1
+            else:
+                batch_size = len(prompt)
+            if isinstance(image, list):
+                image_batch_size = len(image)
+            else:
+                image_batch_size = image.shape[0]
+            if batch_size != image_batch_size:
+                raise ValueError(
+                    f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
+                    " Please make sure that passed `prompt` matches the batch size of `image`."
+                )
+
+        # check noise level
+        if noise_level > self.config.max_noise_level:
+            raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height, width)
+        if latents is None:
+            if device.type == "mps":
+                # randn does not work reproducibly on mps
+                latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[torch.FloatTensor, PIL.Image.Image, List[PIL.Image.Image]],
+        num_inference_steps: int = 75,
+        guidance_scale: float = 9.0,
+        noise_level: int = 20,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`):
+                `Image`, or tensor representing an image batch which will be upscaled. *
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
+        # 1. Check inputs
+        self.check_inputs(prompt, image, noise_level, callback_steps)
+
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+
+        # 3. Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+
+        # 4. Preprocess image
+        image = [image] if isinstance(image, PIL.Image.Image) else image
+        if isinstance(image, list):
+            image = [preprocess(img) for img in image]
+            image = torch.cat(image, dim=0)
+        image = image.to(dtype=text_embeddings.dtype, device=device)
+
+        # 5. set timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps_tensor = self.scheduler.timesteps
+
+        # 5. Add noise to image
+        noise_level = torch.tensor([noise_level], dtype=torch.long, device=device)
+        if device.type == "mps":
+            # randn does not work reproducibly on mps
+            noise = torch.randn(image.shape, generator=generator, device="cpu", dtype=text_embeddings.dtype).to(device)
+        else:
+            noise = torch.randn(image.shape, generator=generator, device=device, dtype=text_embeddings.dtype)
+        image = self.low_res_scheduler.add_noise(image, noise, noise_level)
+        image = torch.cat([image] * 2) if do_classifier_free_guidance else image
+        noise_level = torch.cat([noise_level] * 2) if do_classifier_free_guidance else noise_level
+
+        # 6. Prepare latent variables
+        height, width = image.shape[2:]
+        num_channels_latents = self.vae.config.latent_channels
+        latents = self.prepare_latents(
+            batch_size * num_images_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+
+        # 7. Check that sizes of image and latents match
+        num_channels_image = image.shape[1]
+        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
+            raise ValueError(
+                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
+                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
+                f" `num_channels_image`: {num_channels_image} "
+                f" = {num_channels_latents+num_channels_image}. Please verify the config of"
+                " `pipeline.unet` or your `image` input."
+            )
+
+        # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+        # 9. Denoising loop
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+
+            # concat latents, mask, masked_image_latents in the channel dimension
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            latent_model_input = torch.cat([latent_model_input, image], dim=1)
+
+            # predict the noise residual
+            noise_pred = self.unet(
+                latent_model_input, t, encoder_hidden_states=text_embeddings, class_labels=noise_level
+            ).sample
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+
+        # 10. Post-processing
+        # make sure the VAE is in float32 mode, as it overflows in float16
+        self.vae.to(dtype=torch.float32)
+        image = self.decode_latents(latents.float())
+
+        # 11. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+
+        if not return_dict:
+            return (image,)
+
+        return ImagePipelineOutput(images=image)
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index d255c174c743..2d932d240508 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -154,6 +154,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class StableDiffusionUpscalePipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class VersatileDiffusionDualGuidedPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
new file mode 100644
index 000000000000..2092e153eeb2
--- /dev/null
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
@@ -0,0 +1,315 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import random
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel
+from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
+from diffusers.utils.testing_utils import require_torch_gpu
+from PIL import Image
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False
+
+
+class StableDiffusionUpscalePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @property
+    def dummy_image(self):
+        batch_size = 1
+        num_channels = 3
+        sizes = (32, 32)
+
+        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
+        return image
+
+    @property
+    def dummy_cond_unet_upscale(self):
+        torch.manual_seed(0)
+        model = UNet2DConditionModel(
+            block_out_channels=(32, 32, 64),
+            layers_per_block=2,
+            sample_size=32,
+            in_channels=7,
+            out_channels=4,
+            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"),
+            cross_attention_dim=32,
+            # SD2-specific config below
+            attention_head_dim=8,
+            use_linear_projection=True,
+            only_cross_attention=(True, True, False),
+            num_class_embeds=100,
+        )
+        return model
+
+    @property
+    def dummy_vae(self):
+        torch.manual_seed(0)
+        model = AutoencoderKL(
+            block_out_channels=[32, 32, 64],
+            in_channels=3,
+            out_channels=3,
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            latent_channels=4,
+        )
+        return model
+
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=32,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+            # SD2-specific config below
+            hidden_act="gelu",
+            projection_dim=512,
+        )
+        return CLIPTextModel(config)
+
+    def test_stable_diffusion_upscale(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        unet = self.dummy_cond_unet_upscale
+        low_res_scheduler = DDPMScheduler()
+        scheduler = DDIMScheduler(prediction_type="v_prediction")
+        vae = self.dummy_vae
+        text_encoder = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionUpscalePipeline(
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            max_noise_level=350,
+        )
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=device).manual_seed(0)
+        output = sd_pipe(
+            [prompt],
+            image=low_res_image,
+            generator=generator,
+            guidance_scale=6.0,
+            noise_level=20,
+            num_inference_steps=2,
+            output_type="np",
+        )
+
+        image = output.images
+
+        generator = torch.Generator(device=device).manual_seed(0)
+        image_from_tuple = sd_pipe(
+            [prompt],
+            image=low_res_image,
+            generator=generator,
+            guidance_scale=6.0,
+            noise_level=20,
+            num_inference_steps=2,
+            output_type="np",
+            return_dict=False,
+        )[0]
+
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+
+        expected_height_width = low_res_image.size[0] * 4
+        assert image.shape == (1, expected_height_width, expected_height_width, 3)
+        expected_slice = np.array([0.2562, 0.3606, 0.4204, 0.4469, 0.4822, 0.4647, 0.5315, 0.5748, 0.5606])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+
+    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
+    def test_stable_diffusion_upscale_fp16(self):
+        """Test that stable diffusion upscale works with fp16"""
+        unet = self.dummy_cond_unet_upscale
+        low_res_scheduler = DDPMScheduler()
+        scheduler = DDIMScheduler(prediction_type="v_prediction")
+        vae = self.dummy_vae
+        text_encoder = self.dummy_text_encoder
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+
+        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
+        low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
+
+        # put models in fp16, except vae as it overflows in fp16
+        unet = unet.half()
+        text_encoder = text_encoder.half()
+
+        # make sure here that pndm scheduler skips prk
+        sd_pipe = StableDiffusionUpscalePipeline(
+            unet=unet,
+            low_res_scheduler=low_res_scheduler,
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            max_noise_level=350,
+        )
+        sd_pipe = sd_pipe.to(torch_device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger"
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = sd_pipe(
+            [prompt],
+            image=low_res_image,
+            generator=generator,
+            num_inference_steps=2,
+            output_type="np",
+        ).images
+
+        expected_height_width = low_res_image.size[0] * 4
+        assert image.shape == (1, expected_height_width, expected_height_width, 3)
+
+
+@slow
+@require_torch_gpu
+class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_stable_diffusion_upscale_pipeline(self):
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-upscale/low_res_cat.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale"
+            "/upsampled_cat.npy"
+        )
+
+        model_id = "stabilityai/stable-diffusion-x4-upscaler"
+        pipe = StableDiffusionUpscalePipeline.from_pretrained(model_id)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "a cat sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 1e-3
+
+    def test_stable_diffusion_upscale_pipeline_fp16(self):
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-upscale/low_res_cat.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale"
+            "/upsampled_cat_fp16.npy"
+        )
+
+        model_id = "stabilityai/stable-diffusion-x4-upscaler"
+        pipe = StableDiffusionUpscalePipeline.from_pretrained(
+            model_id,
+            revision="fp16",
+            torch_dtype=torch.float16,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing()
+
+        prompt = "a cat sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipe(
+            prompt=prompt,
+            image=image,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+
+        assert image.shape == (512, 512, 3)
+        assert np.abs(expected_image - image).max() < 5e-1
+
+    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_max_memory_allocated()
+        torch.cuda.reset_peak_memory_stats()
+
+        image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/sd2-upscale/low_res_cat.png"
+        )
+
+        model_id = "stabilityai/stable-diffusion-x4-upscaler"
+        pipe = StableDiffusionUpscalePipeline.from_pretrained(
+            model_id,
+            revision="fp16",
+            torch_dtype=torch.float16,
+        )
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+        pipe.enable_attention_slicing(1)
+        pipe.enable_sequential_cpu_offload()
+
+        prompt = "a cat sitting on a park bench"
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        _ = pipe(
+            prompt=prompt,
+            image=image,
+            generator=generator,
+            num_inference_steps=5,
+            output_type="np",
+        )
+
+        mem_bytes = torch.cuda.max_memory_allocated()
+        # make sure that less than 2.65 GB is allocated
+        assert mem_bytes < 2.65 * 10**9

From 520bb082be33ab9eda43660bf0853b5d4a1854c6 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 25 Nov 2022 15:15:05 +0000
Subject: [PATCH 263/269] fixes tests

---
 tests/pipelines/stable_diffusion_2/test_stable_diffusion.py     | 2 +-
 .../versatile_diffusion/test_versatile_diffusion_mega.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index e1d22662cd83..52bebe419b0b 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -674,7 +674,7 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
                 latents = latents.detach().cpu().numpy()
                 assert latents.shape == (1, 4, 64, 64)
                 latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array([1.078, 1.1804, 1.1339, 0.4664, -0.2354, 0.6097, -0.7749, -0.8784, -0.9465])
+                expected_slice = np.array([1.0757, 1.1860, 1.1410, 0.4645, -0.2476, 0.6100, -0.7755, -0.8841, -0.9497])
                 assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2
 
         test_callback_fn.has_been_called = False
diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
index c69799c9d482..ab4580dae1fe 100644
--- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
@@ -124,5 +124,5 @@ def test_inference_dual_guided_then_text_to_image(self):
         image_slice = image[0, 253:256, 253:256, -1]
 
         assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.3479, 0.1943, 0.1060, 0.3894, 0.2537, 0.1394, 0.3989, 0.3191, 0.1987])
+        expected_slice = np.array([0.3403, 0.1809, 0.0938, 0.3855, 0.2393, 0.1243, 0.4028, 0.3110, 0.1799])
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

From b9e921feea53524038cf40a836d9b48b80846934 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 25 Nov 2022 17:12:58 +0100
Subject: [PATCH 264/269] added initial v-pred support to DPM-solver (#1421)

* added initial v-pred support to DPM-solver

* fix sign

* added v_prediction to flax

* fixed typo
---
 .../scheduling_dpmsolver_multistep.py         | 21 ++++++++++++-------
 .../scheduling_dpmsolver_multistep_flax.py    | 21 ++++++++++++-------
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
index d38ceed281ed..76dc7acc1b9f 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
@@ -88,8 +88,8 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
             the order of DPM-Solver; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided
             sampling, and `solver_order=3` for unconditional sampling.
         prediction_type (`str`, default `epsilon`):
-            indicates whether the model predicts the noise (epsilon), or the data / `x0`. One of `epsilon`, `sample`.
-            `v-prediction` is not supported for this scheduler.
+            indicates whether the model predicts the noise (epsilon), or the data / `x0`. One of `epsilon`, `sample`,
+            or `v-prediction`.
         thresholding (`bool`, default `False`):
             whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
             For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to
@@ -212,7 +212,7 @@ def convert_model_output(
         """
         Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
 
-        DPM-Solver is designed to discretize an integral of the noise prediciton model, and DPM-Solver++ is designed to
+        DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to
         discretize an integral of the data prediction model. So we need to first convert the model output to the
         corresponding type to match the algorithm.
 
@@ -235,10 +235,13 @@ def convert_model_output(
                 x0_pred = (sample - sigma_t * model_output) / alpha_t
             elif self.config.prediction_type == "sample":
                 x0_pred = model_output
+            elif self.config.prediction_type == "v_prediction":
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = alpha_t * sample - sigma_t * model_output
             else:
                 raise ValueError(
-                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
-                    " for the DPMSolverMultistepScheduler."
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the DPMSolverMultistepScheduler."
                 )
 
             if self.config.thresholding:
@@ -260,10 +263,14 @@ def convert_model_output(
                 alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                 epsilon = (sample - alpha_t * model_output) / sigma_t
                 return epsilon
+            elif self.config.prediction_type == "v_prediction":
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                epsilon = alpha_t * model_output + sigma_t * sample
+                return epsilon
             else:
                 raise ValueError(
-                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
-                    " for the DPMSolverMultistepScheduler."
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
+                    " `v_prediction` for the DPMSolverMultistepScheduler."
                 )
 
     def dpm_solver_first_order_update(
diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
index 4d56d99a8cd8..78b611ae2721 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py
@@ -120,8 +120,8 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
             the order of DPM-Solver; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided
             sampling, and `solver_order=3` for unconditional sampling.
         prediction_type (`str`, default `epsilon`):
-            indicates whether the model predicts the noise (epsilon), or the data / `x0`. One of `epsilon`, `sample`.
-            `v-prediction` is not supported for this scheduler.
+            indicates whether the model predicts the noise (epsilon), or the data / `x0`. One of `epsilon`, `sample`,
+            or `v-prediction`.
         thresholding (`bool`, default `False`):
             whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487).
             For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to
@@ -252,7 +252,7 @@ def convert_model_output(
         """
         Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs.
 
-        DPM-Solver is designed to discretize an integral of the noise prediciton model, and DPM-Solver++ is designed to
+        DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to
         discretize an integral of the data prediction model. So we need to first convert the model output to the
         corresponding type to match the algorithm.
 
@@ -275,10 +275,13 @@ def convert_model_output(
                 x0_pred = (sample - sigma_t * model_output) / alpha_t
             elif self.config.prediction_type == "sample":
                 x0_pred = model_output
+            elif self.config.prediction_type == "v_prediction":
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = alpha_t * sample - sigma_t * model_output
             else:
                 raise ValueError(
-                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
-                    " for the FlaxDPMSolverMultistepScheduler."
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, "
+                    " or `v_prediction` for the FlaxDPMSolverMultistepScheduler."
                 )
 
             if self.config.thresholding:
@@ -299,10 +302,14 @@ def convert_model_output(
                 alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
                 epsilon = (sample - alpha_t * model_output) / sigma_t
                 return epsilon
+            elif self.config.prediction_type == "v_prediction":
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                epsilon = alpha_t * model_output + sigma_t * sample
+                return epsilon
             else:
                 raise ValueError(
-                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` "
-                    " for the FlaxDPMSolverMultistepScheduler."
+                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, "
+                    " or `v_prediction` for the FlaxDPMSolverMultistepScheduler."
                 )
 
     def dpm_solver_first_order_update(

From 6883294d4450c637b51e9658d1ab503dcc5fa696 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Fri, 25 Nov 2022 17:23:21 +0100
Subject: [PATCH 265/269] SD2 docs (#1424)

* up

* up

* up

* up
---
 docs/source/_toctree.yml                      |   2 +
 docs/source/api/pipelines/overview.mdx        |   3 +
 .../api/pipelines/stable_diffusion_2.mdx      | 142 ++++++++++++++++++
 docs/source/index.mdx                         |   3 +
 4 files changed, 150 insertions(+)
 create mode 100644 docs/source/api/pipelines/stable_diffusion_2.mdx

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index bf23d363a863..957144488323 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -106,6 +106,8 @@
       title: "Score SDE VE"
     - local: api/pipelines/stable_diffusion
       title: "Stable Diffusion"
+    - local: api/pipelines/stable_diffusion_2
+      title: "Stable Diffusion 2"
     - local: api/pipelines/stable_diffusion_safe
       title: "Safe Stable Diffusion"
     - local: api/pipelines/stochastic_karras_ve
diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
index c43f09d66dde..eed8e0d0b020 100644
--- a/docs/source/api/pipelines/overview.mdx
+++ b/docs/source/api/pipelines/overview.mdx
@@ -58,6 +58,9 @@ available a colab notebook to directly try them out.
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | 
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | 
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
 | [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
 | [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation |
 | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 
diff --git a/docs/source/api/pipelines/stable_diffusion_2.mdx b/docs/source/api/pipelines/stable_diffusion_2.mdx
new file mode 100644
index 000000000000..81a410e222c9
--- /dev/null
+++ b/docs/source/api/pipelines/stable_diffusion_2.mdx
@@ -0,0 +1,142 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Stable diffusion 2
+
+Stable Diffusion 2 is a text-to-image _latent diffusion_ model built upon the work of [Stable Diffusion 1](https://stability.ai/blog/stable-diffusion-public-release). 
+The project to train Stable Diffusion 2 was led by Robin Rombach and Katherine Crowson from [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/).
+
+*The Stable Diffusion 2.0 release includes robust text-to-image models trained using a brand new text encoder (OpenCLIP), developed by LAION with support from Stability AI, which greatly improves the quality of the generated images compared to earlier V1 releases. The text-to-image models in this release can generate images with default resolutions of both 512x512 pixels and 768x768 pixels. 
+These models are trained on an aesthetic subset of the [LAION-5B dataset](https://laion.ai/blog/laion-5b/) created by the DeepFloyd team at Stability AI, which is then further filtered to remove adult content using [LAION’s NSFW filter](https://openreview.net/forum?id=M3Y74vmsMcY).*
+
+For more details about how Stable Diffusion 2 works and how it differs from Stable Diffusion 1, please refer to the official [launch announcement post](https://stability.ai/blog/stable-diffusion-v2-release).
+
+## Tips
+
+### Avaiblable checkpoints:
+
+Note that the architecture is more or less identical to [Stable Diffusion 1](./api/pipelines/stable_diffusion) so please refer to [this page](./api/pipelines/stable_diffusion) for API documentation.
+
+- *Text-to-Image (512x512 resolution)*: [stabilityai/stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) with [`StableDiffusionPipeline`]
+- *Text-to-Image (768x768 resolution)*: [stabilityai/stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) with [`StableDiffusionPipeline`]
+- *Image Inpainting (512x512 resolution)*: [stabilityai/stable-diffusion-2-inpainting](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) with [`StableDiffusionInpaintPipeline`]
+- *Image Upscaling (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) [`StableDiffusionUpscalePipeline`]
+
+We recommend using the [`DPMSolverMultistepScheduler`] as it's currently the fastest scheduler there is.
+
+- *Text-to-Image (512x512 resolution)*:
+
+```python
+from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
+import torch
+
+repo_id = "stabilityai/stable-diffusion-2-base"
+pipe = DiffusionPipeline.from_pretrained(repo_id, torch_dtype=torch.float16, revision="fp16")
+
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+
+prompt = "High quality photo of an astronaut riding a horse in space"
+image = pipe(prompt, num_inference_steps=25).images[0]
+image.save("astronaut.png")
+```
+
+- *Text-to-Image (768x768 resolution)*:
+
+```python
+from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
+import torch
+
+repo_id = "stabilityai/stable-diffusion-2"
+pipe = DiffusionPipeline.from_pretrained(repo_id, torch_dtype=torch.float16, revision="fp16")
+
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+
+prompt = "High quality photo of an astronaut riding a horse in space"
+image = pipe(prompt, guidance_scale=9, num_inference_steps=25).images[0]
+image.save("astronaut.png")
+```
+
+- *Image Inpainting (512x512 resolution)*:
+
+```python
+import PIL
+import requests
+import torch
+from io import BytesIO
+
+from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
+
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+repo_id = "stabilityai/stable-diffusion-2-inpainting"
+pipe = DiffusionPipeline.from_pretrained(repo_id, torch_dtype=torch.float16, revision="fp16")
+
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+
+prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
+image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, num_inference_steps=25).images[0]
+
+image.save("yellow_cat.png")
+```
+
+- *Image Upscaling (x4 resolution resolution)*: [stable-diffusion-x4-upscaler](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler) [`StableDiffusionUpscalePipeline`]
+
+```python
+import requests
+from PIL import Image
+from io import BytesIO
+from diffusers import StableDiffusionUpscalePipeline
+import torch
+
+# load model and scheduler
+model_id = "stabilityai/stable-diffusion-x4-upscaler"
+pipeline = StableDiffusionUpscalePipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
+pipeline = pipeline.to("cuda")
+
+# let's download an  image
+url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png"
+response = requests.get(url)
+low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
+low_res_img = low_res_img.resize((128, 128))
+prompt = "a white cat"
+upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0]
+upscaled_image.save("upsampled_cat.png")
+```
+
+### How to load and use different schedulers.
+
+The stable diffusion pipeline uses [`DDIMScheduler`] scheduler by default. But `diffusers` provides many other schedulers that can be used with the stable diffusion pipeline such as [`PNDMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`] etc.
+To use a different scheduler, you can either change it via the [`ConfigMixin.from_config`] method or pass the `scheduler` argument to the `from_pretrained` method of the pipeline. For example, to use the [`EulerDiscreteScheduler`], you can do the following:
+
+```python
+>>> from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
+
+>>> pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2")
+>>> pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+>>> # or
+>>> euler_scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler")
+>>> pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=euler_scheduler)
+```
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 09cc59fda99c..975ff47b61e6 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -48,6 +48,9 @@ available a colab notebook to directly try them out.
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
 | [stable_diffusion](./api/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | 
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | 
+| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image |
 | [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb)
 | [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | 
 | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | 

From 462a79d39ad278090fbe5fc723d5a2c4d22185b9 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 25 Nov 2022 17:44:07 +0100
Subject: [PATCH 266/269] [Docs] fixed some typos (#1425)

fixed typos
---
 docs/source/api/pipelines/alt_diffusion.mdx      | 2 +-
 docs/source/api/pipelines/stable_diffusion.mdx   | 2 +-
 docs/source/api/pipelines/stable_diffusion_2.mdx | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/api/pipelines/alt_diffusion.mdx b/docs/source/api/pipelines/alt_diffusion.mdx
index 4a75bc09bfa2..8d7d795d7633 100644
--- a/docs/source/api/pipelines/alt_diffusion.mdx
+++ b/docs/source/api/pipelines/alt_diffusion.mdx
@@ -51,7 +51,7 @@ To use a different scheduler, you can either change it via the [`ConfigMixin.fro
 ```
 
 
-- *How to conver all use cases with multiple or single pipeline*
+- *How to convert all use cases with multiple or single pipeline*
 
 If you want to use all possible use cases in a single `DiffusionPipeline` we recommend using the `components` functionality to instantiate all components in the most memory-efficient way:
 
diff --git a/docs/source/api/pipelines/stable_diffusion.mdx b/docs/source/api/pipelines/stable_diffusion.mdx
index cd50c3d5c328..afa72775f06a 100644
--- a/docs/source/api/pipelines/stable_diffusion.mdx
+++ b/docs/source/api/pipelines/stable_diffusion.mdx
@@ -48,7 +48,7 @@ To use a different scheduler, you can either change it via the [`ConfigMixin.fro
 ```
 
 
-### How to conver all use cases with multiple or single pipeline
+### How to convert all use cases with multiple or single pipeline
 
 If you want to use all possible use cases in a single `DiffusionPipeline` you can either:
 - Make use of the [Stable Diffusion Mega Pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community#stable-diffusion-mega) or 
diff --git a/docs/source/api/pipelines/stable_diffusion_2.mdx b/docs/source/api/pipelines/stable_diffusion_2.mdx
index 81a410e222c9..5df9195034c3 100644
--- a/docs/source/api/pipelines/stable_diffusion_2.mdx
+++ b/docs/source/api/pipelines/stable_diffusion_2.mdx
@@ -22,7 +22,7 @@ For more details about how Stable Diffusion 2 works and how it differs from Stab
 
 ## Tips
 
-### Avaiblable checkpoints:
+### Available checkpoints:
 
 Note that the architecture is more or less identical to [Stable Diffusion 1](./api/pipelines/stable_diffusion) so please refer to [this page](./api/pipelines/stable_diffusion) for API documentation.
 

From 6b02323a602a66841729c3a5d60844b24aa81ff2 Mon Sep 17 00:00:00 2001
From: anton- <anton@huggingface.co>
Date: Fri, 25 Nov 2022 17:47:36 +0100
Subject: [PATCH 267/269] Release: v0.9.0

---
 setup.py                  | 2 +-
 src/diffusers/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index c6f2725be1e1..9148acce2613 100644
--- a/setup.py
+++ b/setup.py
@@ -212,7 +212,7 @@ def run(self):
 
 setup(
     name="diffusers",
-    version="0.9.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.9.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
     description="Diffusers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 912ae232a736..256eb8fee8bc 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -9,7 +9,7 @@
 )
 
 
-__version__ = "0.9.0.dev0"
+__version__ = "0.9.0"
 
 from .configuration_utils import ConfigMixin
 from .onnx_utils import OnnxRuntimeModel

From b73e8b53bc8ebe4c4351af1d2e8e065fc41feac2 Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Thu, 1 Dec 2022 13:47:12 +0100
Subject: [PATCH 268/269] sync

---
 src/diffusers/models/attention.py | 224 ++++++++++++++++++++++++++++++
 1 file changed, 224 insertions(+)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 6081f0d8b75f..314ba291843a 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -20,6 +20,12 @@
 import torch.nn.functional as F
 from torch import nn
 
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..modeling_utils import ModelMixin
+from ..models.embeddings import ImagePositionalEmbeddings
+from ..utils import BaseOutput
+from ..utils.import_utils import is_xformers_available
+
 from einops import rearrange
 
 # TODO(danfu): upstream this to FlashAttention
@@ -43,6 +49,224 @@ def check_cuda():
 except ImportError:
     flash_attn_installed = False
 
+@dataclass
+class Transformer2DModelOutput(BaseOutput):
+    """
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
+            Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions
+            for the unnoised latent pixels.
+    """
+
+    sample: torch.FloatTensor
+
+
+if is_xformers_available():
+    import xformers
+    import xformers.ops
+else:
+    xformers = None
+
+
+class Transformer2DModel(ModelMixin, ConfigMixin):
+    """
+    Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
+    embeddings) inputs.
+    When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
+    transformer action. Finally, reshape to image.
+    When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
+    embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
+    classes of unnoised image.
+    Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
+    image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            Pass if the input is continuous. The number of channels in the input and output.
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of context dimensions to use.
+        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
+            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
+            `ImagePositionalEmbeddings`.
+        num_vector_embeds (`int`, *optional*):
+            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+            Includes the class for the masked latent pixel.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
+            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
+            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
+            up to but not more than steps than `num_embeds_ada_norm`.
+        attention_bias (`bool`, *optional*):
+            Configure if the TransformerBlocks' attention should contain a bias parameter.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+    ):
+        super().__init__()
+        self.use_linear_projection = use_linear_projection
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+
+        # 1. Transformer2DModel can process both standard continous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+        # Define whether input is continuous or discrete depending on configuration
+        self.is_input_continuous = in_channels is not None
+        self.is_input_vectorized = num_vector_embeds is not None
+
+        if self.is_input_continuous and self.is_input_vectorized:
+            raise ValueError(
+                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
+                " sure that either `in_channels` or `num_vector_embeds` is None."
+            )
+        elif not self.is_input_continuous and not self.is_input_vectorized:
+            raise ValueError(
+                f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make"
+                " sure that either `in_channels` or `num_vector_embeds` is not None."
+            )
+
+        # 2. Define input layers
+        if self.is_input_continuous:
+            self.in_channels = in_channels
+
+            self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+            if use_linear_projection:
+                self.proj_in = nn.Linear(in_channels, inner_dim)
+            else:
+                self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
+            assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
+
+            self.height = sample_size
+            self.width = sample_size
+            self.num_vector_embeds = num_vector_embeds
+            self.num_latent_pixels = self.height * self.width
+
+            self.latent_image_embedding = ImagePositionalEmbeddings(
+                num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
+            )
+
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
+                )
+                for d in range(num_layers)
+            ]
+        )
+
+        # 4. Define output layers
+        if self.is_input_continuous:
+            if use_linear_projection:
+                self.proj_out = nn.Linear(in_channels, inner_dim)
+            else:
+                self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            self.norm_out = nn.LayerNorm(inner_dim)
+            self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
+
+    def _set_attention_slice(self, slice_size):
+        for block in self.transformer_blocks:
+            block._set_attention_slice(slice_size)
+
+    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
+        """
+        Args:
+            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
+                When continous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
+                hidden_states
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, context dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.long`, *optional*):
+                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+        Returns:
+            [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
+            if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
+            tensor.
+        """
+        # 1. Input
+        if self.is_input_continuous:
+            batch, channel, height, weight = hidden_states.shape
+            residual = hidden_states
+
+            hidden_states = self.norm(hidden_states)
+            if not self.use_linear_projection:
+                hidden_states = self.proj_in(hidden_states)
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
+            else:
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
+                hidden_states = self.proj_in(hidden_states)
+        elif self.is_input_vectorized:
+            hidden_states = self.latent_image_embedding(hidden_states)
+
+        # 2. Blocks
+        for block in self.transformer_blocks:
+            hidden_states = block(hidden_states, context=encoder_hidden_states, timestep=timestep)
+
+        # 3. Output
+        if self.is_input_continuous:
+            if not self.use_linear_projection:
+                hidden_states = (
+                    hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
+                )
+                hidden_states = self.proj_out(hidden_states)
+            else:
+                hidden_states = self.proj_out(hidden_states)
+                hidden_states = (
+                    hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
+                )
+
+            output = hidden_states + residual
+        elif self.is_input_vectorized:
+            hidden_states = self.norm_out(hidden_states)
+            logits = self.out(hidden_states)
+            # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
+            logits = logits.permute(0, 2, 1)
+
+            # log(p(x_0))
+            output = F.log_softmax(logits.double(), dim=1).float()
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
+
+    def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        for block in self.transformer_blocks:
+            block._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
+
 
 class AttentionBlock(nn.Module):
     """

From 7f12ed8fd3b8feaa34eff037d107cee6e741de2b Mon Sep 17 00:00:00 2001
From: Xiaozhe Yao <askxzyao@gmail.com>
Date: Thu, 1 Dec 2022 13:54:27 +0100
Subject: [PATCH 269/269] minor

---
 src/diffusers/models/attention.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index 314ba291843a..2bbc1a9c41de 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -541,15 +541,15 @@ def __init__(
         self._use_memory_efficient_attention_xformers = False
 
         self.dim_head = dim_head
-        self.context_dim = context_dim
+        self.context_dim = cross_attention_dim
         self.query_dim = query_dim
 
         if self.context_dim == self.query_dim and self.dim_head <= 128 and (self.dim_head % 8) == 0 and flash_attn_installed:
             self.flash_attn = FlashAttention(self.scale)
 
         self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
-        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
-        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=False)
 
         self.to_out = nn.ModuleList([])
         self.to_out.append(nn.Linear(inner_dim, query_dim))