In [None]:
# No need to use accelerate's Accelerator here because we only have a single GPU.

In [None]:
# Clone the diffusers repository, enter the checkout, and install it from source.
!git clone https://github.com/huggingface/diffusers
%cd diffusers
!pip install .

In [3]:
# Move into the text-to-image example directory (path is relative to the diffusers checkout).
%cd examples/text_to_image

/content/diffusers/examples/text_to_image


In [None]:
# Install the dependencies listed in the current directory's requirements.txt.
!pip install -r requirements.txt

In [None]:
# Install Weights & Biases (wandb), which the following cells import for logging.
! pip install wandb

In [None]:
import wandb
# Authenticate this session with Weights & Biases before any run is created.
wandb.login()

In [7]:
import argparse
import logging
import math
import os
import random
import shutil
from pathlib import Path

import datasets
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
import transformers
from accelerate import Accelerator
# Open question: how do we use get_logger to obtain log output?
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, set_seed
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
from packaging import version
from torchvision import transforms
# tqdm is a Python library for displaying progress bars; it is usually wrapped around a loop so the user can watch progress in real time.
from tqdm.auto import tqdm
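# CLIPTextModel: a pretrained text model that takes input text and produces its text representation; it can be used to extract text features or for other text-related tasks.
# CLIPTokenizer: the tokenizer that converts input text into tokens for the CLIP model.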
from transformers import CLIPTextModel, CLIPTokenizer

import diffusers
from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel
from diffusers.models.lora import LoRALinearLayer
from diffusers.optimization import get_scheduler
from diffusers.training_utils import compute_snr
from diffusers.utils import check_min_version, is_wandb_available
from diffusers.utils.import_utils import is_xformers_available


# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
check_min_version("0.24.0.dev0")

# Module-level logger created through accelerate's logging helper.
# NOTE(review): no Accelerator is constructed in the visible cells; accelerate's logger may
# require its state to be initialised before the first log call — confirm before using it.
logger = get_logger(__name__, log_level="INFO")

In [8]:
# TODO: This function should be removed once training scripts are rewritten in PEFT
def text_encoder_lora_state_dict(text_encoder):
    """Collect the LoRA weights attached to a CLIP text encoder's attention projections.

    For every encoder layer, the `lora_linear_layer` state dict of the q/k/v/out
    projections is flattened into a single dict whose keys have the form
    ``text_model.encoder.layers.{i}.self_attn.{proj}.lora_linear_layer.{param}``.

    Returns an empty dict when `text_encoder` is neither a `CLIPTextModel` nor a
    `CLIPTextModelWithProjection`.
    """
    from transformers import CLIPTextModel, CLIPTextModelWithProjection

    lora_weights = {}
    if not isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
        return lora_weights

    # Same projection order as the keys were originally emitted in: q, k, v, out.
    projection_names = ("q_proj", "k_proj", "v_proj", "out_proj")

    for layer_idx, layer in enumerate(text_encoder.text_model.encoder.layers):
        prefix = f"text_model.encoder.layers.{layer_idx}.self_attn"
        attention = layer.self_attn
        for proj_name in projection_names:
            lora_layer = getattr(attention, proj_name).lora_linear_layer
            for param_name, tensor in lora_layer.state_dict().items():
                lora_weights[f"{prefix}.{proj_name}.lora_linear_layer.{param_name}"] = tensor

    return lora_weights


def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None):
    """Write a README.md model card (plus sample images) into `repo_folder`.

    Args:
        repo_id: Repository identifier shown in the card title.
        images: Optional iterable of objects exposing ``save(path)`` (e.g. PIL images).
            Each one is saved as ``image_{i}.png`` in `repo_folder` and linked from the
            card. ``None`` (the default) means "no sample images".
        base_model: Name of the base model, interpolated into the card text.
        dataset_name: Name of the training dataset, interpolated into the card text.
        repo_folder: Directory that receives the images and the README.md.

    Note:
        The defaults of `base_model` and `dataset_name` are the ``str`` type object
        itself (kept unchanged for interface compatibility); callers should pass real
        strings for both.
    """
    # The signature advertises images=None, so that must not crash in enumerate().
    if images is None:
        images = []

    img_str = ""
    for i, image in enumerate(images):
        image.save(os.path.join(repo_folder, f"image_{i}.png"))
        img_str += f"![img_{i}](./image_{i}.png)\n"

    yaml = f"""
---
license: creativeml-openrail-m
base_model: {base_model}
tags:
- stable-diffusion
- stable-diffusion-diffusers
- text-to-image
- diffusers
- lora
inference: true
---
    """
    model_card = f"""
# LoRA text2image fine-tuning - {repo_id}
These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n
{img_str}
"""
    with open(os.path.join(repo_folder, "README.md"), "w") as f:
        f.write(yaml + model_card)

In [41]:
# Run configuration. These module-level names replace the command-line arguments of the
# original training script and are read by the cells below.

# Base Stable Diffusion checkpoint; its components are loaded with `from_pretrained`.
pretrained_model_name_or_path = "runwayml/stable-diffusion-v1-5"
#dataset_name = "lambdalabs/pokemon-blip-captions"
dataset_name = None
revision = None
#train_data_dir = None
### TODO: change it to the directory where you save your images and csv ###
train_data_dir = "/content/drive/MyDrive/cs182-master/emoji_image/emoji"
variant = None
dataset_config_name = None
# Column names used to look up images and captions in the dataset.
image_column = "image"
caption_column = "text"
#validation_prompt = "Totoro"  # Can more prompts be added? Yes - keep them in a list.
# validation_prompt = ["grinning face with sweat in Apple style",
#           "grinning face with sweat in Facebook style",
#           "grinning face with sweat in Google style",
#           "grinning face with sweat in JoyPixels style",
#          "grinning face with sweat in Samsung style",
#         "grinning face with sweat in Twitter style",
#         "grinning face with sweat in Windows style"]
# The training loop indexes this list with range(num_validation_images), so it needs
# at least `num_validation_images` entries.
validation_prompt = ["grinning face with sweat in Apple style"]
num_validation_images = 1
validation_epochs = 1
max_train_samples = None
### TODO: change it to the directory where you save your output model ###
# The training loop builds checkpoint paths by appending to this string, so keep the trailing "/".
output_dir = "/content/drive/MyDrive/huggingface_diffuser/model/checkpoints/"
cache_dir = None
# Target image size passed to the Resize / crop transforms.
resolution = 256
center_crop = False
random_flip = False
train_batch_size = 1
gradient_accumulation_steps = 1
### TODO: can change those two ###
# NOTE(review): max_train_steps is reassigned to 10000 in the training cell, after the
# LR scheduler has been built with the value set here.
max_train_steps = 15000
num_train_epochs = 100
gradient_checkpointing = False
learning_rate = 1e-4
scale_lr = False
# NOTE(review): this name is rebound to the scheduler object in the DataLoader/scheduler cell.
lr_scheduler = "cosine"
lr_warmup_steps = 0
snr_gamma = None
use_8bit_adam = False
allow_tf32 = False
dataloader_num_workers = 2
# AdamW hyper-parameters.
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_weight_decay = 1e-2
adam_epsilon = 1e-08
max_grad_norm = 1
push_to_hub = False
hub_model_id = None
# When not None, the training loop overrides the noise scheduler's prediction type with it.
prediction_type = None
mixed_precision = "fp16"
report_to = "wandb" # or "tensorboard"
local_rank = -1
### TODO: you can change it ###
checkpointing_steps = 500
checkpoints_total_limit = None
# "Whether training should be resumed from a previous checkpoint. Use a path saved by"
           # ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
resume_from_checkpoint = None
enable_xformers_memory_efficient_attention = False
noise_offset = 0
logging_dir = "logs"
# TODO: lora rank ; we need to run different experiments on that ###
rank = 4
# Seed passed to set_seed() and to the validation image generator.
seed = 1337
# DATASET_NAME_MAPPING = { # may need changing because we now use a different dataset
#     "lambdalabs/pokemon-blip-captions": ("image", "text"),
# }
DATASET_NAME_MAPPING = { # may need changing because we now use a different dataset
    train_data_dir: ("image", "text"),
}

In [10]:
# Seed the random number generators before anything else is created.
set_seed(seed)

# Load every component of the pretrained diffusion model: scheduler, tokenizer,
# text encoder, VAE and UNet (kept in this order).
noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")

text_component_kwargs = {"revision": revision}
tokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_name_or_path, subfolder="tokenizer", **text_component_kwargs
)
text_encoder = CLIPTextModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="text_encoder", **text_component_kwargs
)

weight_component_kwargs = {"revision": revision, "variant": variant}
vae = AutoencoderKL.from_pretrained(
    pretrained_model_name_or_path, subfolder="vae", **weight_component_kwargs
)
unet = UNet2DConditionModel.from_pretrained(
    pretrained_model_name_or_path, subfolder="unet", **weight_component_kwargs
)

# Freeze the original weights to save memory: the pretrained parameters stay fixed and
# only the LoRA layers added later are trained.
for frozen_module in (unet, vae):
    frozen_module.requires_grad_(False)
# Kept as the final bare expression so the cell still displays the text encoder.
text_encoder.requires_grad_(False)


scheduler/scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

In [11]:
# Start fine-tuning! The weights being adjusted are the LoRA weights inside the UNet.
# Now we add new LoRA weights to the attention layers.
# It is important to realise how many attention weights are added and of which sizes.
# The sizes of the attention layers depend on only two variables:
#
# 1) the "hidden_size", which grows according to `unet.config.block_out_channels`;
# 2) the "cross attention size", which is set to `unet.config.cross_attention_dim`.
#
# Number of attention processors to set for Stable Diffusion:
# - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12
# - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2
# - up blocks (2x attention layers) * (3x transformer layers) * (3x up blocks) = 18
# => 32 layers

# Every trainable LoRA parameter, in q / k / v / out order per attention module.
unet_lora_parameters = []

for attn_processor_name in unet.attn_processors:
    # The processor name is a dotted path; dropping its last component gives the path
    # of the attention module that owns the processor.
    attn_module = unet
    for attribute in attn_processor_name.split(".")[:-1]:
        attn_module = getattr(attn_module, attribute)

    projections = (
        attn_module.to_q,
        attn_module.to_k,
        attn_module.to_v,
        attn_module.to_out[0],
    )
    for projection in projections:
        # Attach a LoRA layer with the same in/out sizes as the projection it adapts ...
        projection.set_lora_layer(
            LoRALinearLayer(
                in_features=projection.in_features,
                out_features=projection.out_features,
                rank=rank,
            )
        )
        # ... and register its parameters with the optimizer's parameter list.
        unet_lora_parameters.extend(projection.lora_layer.parameters())

# AdamW over the LoRA parameters only.
optimizer_cls = torch.optim.AdamW
adamw_settings = {
    "lr": learning_rate,
    "betas": (adam_beta1, adam_beta2),
    "weight_decay": adam_weight_decay,
    "eps": adam_epsilon,
}
optimizer = optimizer_cls(unet_lora_parameters, **adamw_settings)

In [12]:
from datasets import Dataset
import pandas as pd
from PIL import Image
class ImageCaptionDataset(Dataset):
    """Image/caption pairs described by a CSV file.

    The CSV is read into `self.df`. Column 0 is used as the image file name (joined
    onto `image_folder`) and column 1 as the caption text.
    """

    def __init__(self, csv_file, image_folder, transform=None):
        """Read `csv_file` and remember where the images live.

        Args:
            csv_file: Path of the CSV file listing file names and captions.
            image_folder: Directory that the file names in column 0 are relative to.
            transform: Optional callable applied to each loaded RGB image.
        """
        self.df = pd.read_csv(csv_file)
        self.image_folder = image_folder
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load row `idx` and return ``{"image": ..., "text": ...}``.

        The image is opened with PIL and converted to RGB; `transform` is applied
        when one was given.
        """
        img_name = os.path.join(self.image_folder, self.df.iloc[idx, 0])
        image = Image.open(img_name).convert("RGB")

        if self.transform is not None:
            image = self.transform(image)

        text = self.df.iloc[idx, 1]
        # The leftover debugging print of type(image) was removed: it wrote one line
        # to the notebook output for every sample fetched.
        return {"image": image, "text": text}

In [14]:
def convert_to_hf_dataset(dataset):
    """Build a `Dataset` from the pandas frame stored on `dataset.df` and return it."""
    return Dataset.from_pandas(dataset.df)

# Preprocessing the datasets.
# Captions have to be tokenized and images transformed before training.
def tokenize_captions(examples, is_train=True):
    """Tokenize the captions of a batch and return their input ids.

    Args:
        examples: Batch mapping; the captions are read from `examples[caption_column]`.
            Each entry is either a string or a list/ndarray of candidate strings.
        is_train: For multi-caption entries, pick a random caption when True and the
            first one when False.

    Returns:
        The `input_ids` produced by the tokenizer, padded/truncated to
        `tokenizer.model_max_length` and returned as PyTorch tensors.

    Raises:
        ValueError: If an entry is neither a string nor a list/ndarray.
    """

    def _pick_caption(entry):
        # Plain strings are used as-is.
        if isinstance(entry, str):
            return entry
        # Several candidates: random one while training, first one otherwise.
        if isinstance(entry, (list, np.ndarray)):
            return random.choice(entry) if is_train else entry[0]
        raise ValueError(
            f"Caption column `{caption_column}` should contain either strings or lists of strings."
        )

    chosen_captions = [_pick_caption(entry) for entry in examples[caption_column]]
    encoded = tokenizer(
        chosen_captions,
        max_length=tokenizer.model_max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return encoded.input_ids

def preprocess_train(examples):
    """Batch transform for the training set: load images and tokenize captions.

    Reads the file names in `examples[image_column]`, opens each file from the
    module-level `image_folder` as an RGB image, and adds two entries to `examples`:
    ``pixel_values`` (each image passed through `train_transforms`) and ``input_ids``
    (from `tokenize_captions`). Returns the same `examples` mapping.
    """
    # os.path.join instead of string concatenation, so a missing trailing slash on
    # `image_folder` no longer produces a wrong path (and it matches how
    # ImageCaptionDataset builds its paths).
    images = [
        Image.open(os.path.join(image_folder, image)).convert("RGB")
        for image in examples[image_column]
    ]
    examples["pixel_values"] = [train_transforms(image) for image in images]
    examples["input_ids"] = tokenize_captions(examples)
    return examples

def collate_fn(examples):
    """Stack per-sample tensors into one batch dict for the DataLoader.

    Args:
        examples: Sequence of dicts, each holding a ``pixel_values`` tensor and an
            ``input_ids`` tensor.

    Returns:
        ``{"pixel_values": float32 tensor in contiguous memory format,
        "input_ids": stacked ids}``.
    """
    image_tensors = []
    token_tensors = []
    for sample in examples:
        image_tensors.append(sample["pixel_values"])
        token_tensors.append(sample["input_ids"])

    batched_images = torch.stack(image_tensors)
    batched_images = batched_images.to(memory_format=torch.contiguous_format).float()
    return {"pixel_values": batched_images, "input_ids": torch.stack(token_tensors)}

In [13]:
# Construct the train and validation datasets.
# TODO: image_folder is the place where images stored; csv should be in the same dir with images
csv_path = "/content/drive/MyDrive/cs182-master/emoji_image/train/emoji_png.csv"
image_folder = "/content/drive/MyDrive/cs182-master/emoji_image/train/"
csv_path_valid = "/content/drive/MyDrive/cs182-master/emoji_image/validation/valid.csv"
image_folder_valid = "/content/drive/MyDrive/cs182-master/emoji_image/validation"

# ImageCaptionDataset is only used to read the CSV here: convert_to_hf_dataset() keeps
# its `.df` frame and nothing else, so images are actually loaded by the transforms
# registered with `with_transform` below.
dataset = ImageCaptionDataset(csv_file=csv_path, image_folder=image_folder, transform=transforms.ToTensor())
valid_dataset = ImageCaptionDataset(csv_file=csv_path_valid, image_folder=image_folder_valid, transform=transforms.ToTensor())
dataset = convert_to_hf_dataset(dataset)
valid_dataset = convert_to_hf_dataset(valid_dataset)

# Image pipeline: resize, crop to `resolution`, optional horizontal flip, convert to a
# tensor and normalise with mean 0.5 / std 0.5.
train_transforms = transforms.Compose(
    [
        transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
        transforms.CenterCrop(resolution) if center_crop else transforms.RandomCrop(resolution),
        transforms.RandomHorizontalFlip() if random_flip else transforms.Lambda(lambda x: x),
        transforms.ToTensor(),
        transforms.Normalize([0.5], [0.5]),
    ]
)


def preprocess_valid(examples):
    """Batch transform for the validation set.

    Same steps as `preprocess_train`, but the image files are opened from
    `image_folder_valid`. Previously the validation set reused `preprocess_train`,
    which reads from the training `image_folder`, so `image_folder_valid` was never
    used to load an image. Captions are tokenized with ``is_train=False`` so that
    multi-caption rows are resolved deterministically.
    """
    images = [
        Image.open(os.path.join(image_folder_valid, name)).convert("RGB")
        for name in examples[image_column]
    ]
    examples["pixel_values"] = [train_transforms(image) for image in images]
    examples["input_ids"] = tokenize_captions(examples, is_train=False)
    return examples


# Register the on-the-fly transforms.
train_dataset = dataset.with_transform(preprocess_train)
valid_dataset = valid_dataset.with_transform(preprocess_valid)

In [42]:
# DataLoaders creation: both loaders share the collate function and worker count.
shared_loader_kwargs = {
    "collate_fn": collate_fn,
    "num_workers": dataloader_num_workers,
}

# Training batches are shuffled and use the configured batch size.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, shuffle=True, batch_size=train_batch_size, **shared_loader_kwargs
)

# Validation batches keep their order and use a fixed batch size of 7.
valid_dataloader = torch.utils.data.DataLoader(
    valid_dataset, shuffle=False, batch_size=7, **shared_loader_kwargs
)

# Learning-rate scheduler. The schedule name is written out literally because
# `lr_scheduler` is rebound from the config string to the scheduler object right here.
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=lr_warmup_steps,
    num_training_steps=max_train_steps,
)


In [31]:
# History of per-step training losses; the training loop appends `loss.item()` to it.
trainning_loss = []

In [None]:
# Start a Weights & Biases run with default settings.
# NOTE(review): the following cell calls `wandb.init(...)` again with a project name and
# resume=True — confirm that both calls are intended.
wandb.init()

In [None]:
# W&B project that the run is recorded under.
PROJECT_NAME = "diffusers-examples_text_to_image"
# File the training loop writes its checkpoint dict to with torch.save().
CHECKPOINT_PATH = "/content/drive/MyDrive/huggingface_diffuser/model/checkpoints/checkpoint.tar"
run = wandb.init(project=PROJECT_NAME, resume=True)
# Restore state only when W&B reports that this run was resumed.
if wandb.run.resumed:
    checkpoint = torch.load(wandb.restore(CHECKPOINT_PATH))
    # "unet" stores the directory of the saved LoRA attention weights, not the weights themselves.
    unet_path = checkpoint["unet"]
    unet.load_attn_procs(unet_path)
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    # NOTE(review): epoch / loss / batch are read back, but the training cell restarts its
    # loops from epoch 0 and rebinds these names — confirm whether a true resume is wanted.
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    batch = checkpoint["batch"]



In [None]:
# Train!
# Single process, so the effective batch size has no `num_processes` factor.
total_batch_size = train_batch_size * gradient_accumulation_steps

# Only sizes the progress bar: the loops below are bounded by `max_epoch` and the length
# of the dataloader, not by this value.
# NOTE(review): if the scheduler cell ran before this one, the LR scheduler was built with
# the earlier `max_train_steps` value — confirm which horizon is intended.
max_train_steps = 10000

progress_bar = tqdm(
    range(0, max_train_steps),
    initial=0,
    desc="Steps",
)

# Separate inference pipeline, placed on the GPU, used only to render validation images.
# NOTE(review): nothing in this cell moves `unet`, `vae`, `text_encoder` or the batches to
# the GPU; unless another cell does, training itself runs on the CPU — confirm.
pipeline = DiffusionPipeline.from_pretrained(
    pretrained_model_name_or_path,
    safety_checker=None
).to("cuda")

# The optional prediction-type override never changes during training, so register it
# once instead of on every step.
if prediction_type is not None:
    noise_scheduler.register_to_config(prediction_type=prediction_type)

# Validation, checkpointing and sample generation run when `step % eval_interval == 1`,
# i.e. on steps 1, 20, 39, ... of every epoch.
eval_interval = 19


def _prepare_diffusion_inputs(batch):
    """Turn a collated batch into the tensors needed for one denoising-loss evaluation.

    Encodes the images into scaled VAE latents, samples noise and one random timestep
    per image, applies the forward diffusion process and encodes the captions.

    Args:
        batch: Dict with ``pixel_values`` and ``input_ids`` as produced by `collate_fn`.

    Returns:
        Tuple ``(noisy_latents, timesteps, encoder_hidden_states, target)`` where
        `target` is the noise ("epsilon") or the velocity ("v_prediction").

    Raises:
        ValueError: If the scheduler's prediction type is neither of the two above.
    """
    # Convert images to latent space.
    latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
    latents = latents * vae.config.scaling_factor
    # Sample the noise that will be added to the latents.
    noise = torch.randn_like(latents)
    bsz = latents.shape[0]
    # Sample a random timestep for each image of *this* batch.
    timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
    timesteps = timesteps.long()
    # Forward diffusion: add noise according to the noise magnitude at each timestep.
    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
    # Text embedding used for conditioning.
    encoder_hidden_states = text_encoder(batch["input_ids"])[0]

    # Pick the regression target that matches the scheduler's prediction type.
    if noise_scheduler.config.prediction_type == "epsilon":
        target = noise
    elif noise_scheduler.config.prediction_type == "v_prediction":
        target = noise_scheduler.get_velocity(latents, noise, timesteps)
    else:
        raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
    return noisy_latents, timesteps, encoder_hidden_states, target


max_epoch = 10 #5
for epoch in range(max_epoch):
    unet.train()
    for step, batch in enumerate(train_dataloader):
        # Training inputs are sampled before validation runs, as before.
        noisy_latents, timesteps, encoder_hidden_states, target = _prepare_diffusion_inputs(batch)
        is_eval_step = step % eval_interval == 1

        # Validation loss.
        if is_eval_step:
            unet.eval()
            # No gradients are needed for validation; skipping them avoids building an
            # autograd graph for every validation batch.
            with torch.no_grad():
                for valid_batch in valid_dataloader:
                    # Each validation batch gets its own timesteps. Previously they were
                    # overwritten with the training batch's timesteps.
                    (
                        valid_noisy_latents,
                        valid_timesteps,
                        valid_encoder_hidden_states,
                        valid_target,
                    ) = _prepare_diffusion_inputs(valid_batch)
                    valid_model_pred = unet(valid_noisy_latents, valid_timesteps, valid_encoder_hidden_states).sample
                    valid_loss = F.mse_loss(valid_model_pred.float(), valid_target.float(), reduction="mean")
                    print("valid_loss = "+str(valid_loss.item()))
                    wandb.log({"valid_loss": valid_loss.item()})
            unet.train()

        # Predict the noise residual and compute the loss.
        model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
        # Backpropagate and update the LoRA parameters.
        loss.backward()
        loss_value = loss.item()
        trainning_loss.append(loss_value)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        print("loss = "+str(loss_value))
        # Log a plain float rather than a tensor that is still attached to the graph.
        wandb.log({"loss": loss_value})

        if is_eval_step:
            latest_dir = os.path.join(output_dir, "latest")
            unet.save_attn_procs(os.path.join(output_dir, f"epoch_{epoch+1}_batch_{step+1}"))
            unet.save_attn_procs(latest_dir)
            # Save the checkpoint description; "unet" is the directory of the LoRA weights.
            torch.save(
                {
                    "epoch": epoch,
                    "unet": latest_dir,
                    "optimizer_state_dict": optimizer.state_dict(),
                    "loss": loss_value,
                    "batch": step,
                },
                CHECKPOINT_PATH,
            )
            wandb.save(CHECKPOINT_PATH)

            # Load the freshly saved attention processors into the inference pipeline.
            # The device move is a separate call on the UNet instead of being chained
            # onto the return value of load_attn_procs().
            pipeline.unet.load_attn_procs(latest_dir)
            pipeline.unet.to("cuda")

            # Run inference with a seeded generator so samples are comparable over time.
            generator = torch.Generator()
            if seed is not None:
                generator = generator.manual_seed(seed)
            images = []
            for i in range(num_validation_images):
                images.append(pipeline(validation_prompt[i], num_inference_steps=30, generator=generator).images[0])
            wandb.log({"validation": [wandb.Image(image, caption=f"{i}: {validation_prompt[i]}")for i, image in enumerate(images)]})


# Save the final LoRA weights in float32.
unet = unet.to(torch.float32)
unet.save_attn_procs(output_dir)


# # Final inference
# # Load previous pipeline
# pipeline = DiffusionPipeline.from_pretrained(
#     pretrained_model_name_or_path,
#     safety_checker=None
# )
# # load attention processors
# pipeline.unet.load_attn_procs(output_dir)

# # run inference
# generator = torch.Generator()
# if seed is not None:
#     generator = generator.manual_seed(seed)
# images = []

# for _ in range(num_validation_images):
#   # 如果这里要改为多个prompt 后面可能就不能加[0]?
#     images.append(pipeline(validation_prompt, num_inference_steps=30, generator=generator).images[0])
# wandb.log({"validation": [wandb.Image(image, caption=f"{i}: {validation_prompt}")for i, image in enumerate(images)]})

# unet = unet.to(torch.float32)
#       unet.save_attn_procs(output_dir)
#       # Final inference
#       # Load previous pipeline
#       pipeline = DiffusionPipeline.from_pretrained(
#           pretrained_model_name_or_path, revision=revision, variant=variant
#       )
#       #torch_dtype=weight_dtype
#       #pipeline = pipeline.to(accelerator.device)
#       # load attention processors
#       pipeline.unet.load_attn_procs(output_dir)

#       # run inference
#       generator = torch.Generator()
#       if seed is not None:
#           generator = generator.manual_seed(seed)
#       images = []
#       for _ in range(num_validation_images):
#           images.append(pipeline(validation_prompt, num_inference_steps=30, generator=generator).images[0])
#       wandb.log({"validation": [wandb.Image(image, caption=f"{i}: {validation_prompt}")for i, image in enumerate(images)]})
# if len(images) != 0:
#   np_images = np.stack([np.asarray(img) for img in images])
#   writer.add_images("test", np_images, dataformats="NHWC")


Steps:   0%|          | 0/10000 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


loss = 0.040078699588775635
loss = 0.004209933802485466


  0%|          | 0/30 [00:00<?, ?it/s]

loss = 0.07573135197162628
loss = 0.024343110620975494
loss = 0.05296977981925011
loss = 0.053161315619945526
loss = 0.02039983682334423




loss = 0.0049379728734493256
loss = 0.05967239290475845
loss = 0.059050045907497406
loss = 0.03454756364226341
loss = 0.03686803579330444
loss = 0.10082166641950607
loss = 0.05777546018362045
loss = 0.07134773582220078
loss = 0.047360509634017944
loss = 0.03297625482082367
loss = 0.03826602175831795
loss = 0.04847978800535202
loss = 0.07662438601255417
loss = 0.011110270395874977


  0%|          | 0/30 [00:00<?, ?it/s]