In [None]:
# Install the diffusion stack from PyPI: diffusers (with its torch extra), transformers and accelerate.
!pip install diffusers["torch"] transformers accelerate
# Install diffusers again, this time straight from the GitHub repository.
# NOTE(review): this runs after the PyPI install above, so it presumably replaces that
# release with the repository head — confirm which version is intended and pin it.
!pip install git+https://github.com/huggingface/diffusers
# einops provides `rearrange`, which the attention-patching cell imports.
!pip install einops
# NOTE(review): bayesian-optimization is not imported in the visible cells — presumably used later; confirm.
!pip install bayesian-optimization

Collecting diffusers[torch]
  Downloading diffusers-0.28.0-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.4->diffusers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.4->diffusers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.4->diffusers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.4->diffusers[torch])
  Using cac

In [None]:
import nltk
# Download the NLTK resources used by this notebook.
# `nltk.tag.pos_tag` is called in stable_diffusion_inference to tag the prompt tokens.
# NOTE(review): 'punkt' and 'wordnet' are not referenced in the visible cells — confirm they are needed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import shutil
import os
# Start from a clean slate: delete any dataset folder left over from a previous run
# so the archives extracted below do not mix with stale files.
for stale_dir in ('Cityscape', 'Kvasir-SEG', 'VOC2012', 'Vaihingen'):
  if os.path.isdir(stale_dir):
    shutil.rmtree(stale_dir)
# Copy the four dataset archives from drive/MyDrive/dataset into the working directory.
!scp drive/MyDrive/dataset/VOC2012.zip ./
!scp drive/MyDrive/dataset/Cityscape.zip ./
!scp drive/MyDrive/dataset/Kvasir-SEG.zip ./
!scp drive/MyDrive/dataset/Vaihingen.zip ./

# Extract each archive next to the notebook (the folders were removed above if they existed).
!unzip VOC2012.zip
!unzip Cityscape.zip
!unzip Kvasir-SEG.zip
!unzip Vaihingen.zip

Archive:  VOC2012.zip
   creating: VOC2012/
   creating: VOC2012/images/
  inflating: VOC2012/images/311.png  
  inflating: VOC2012/images/28.png   
  inflating: VOC2012/images/568.png  
  inflating: VOC2012/images/701.png  
  inflating: VOC2012/images/186.png  
  inflating: VOC2012/images/293.png  
  inflating: VOC2012/images/206.png  
  inflating: VOC2012/images/146.png  
  inflating: VOC2012/images/819.png  
  inflating: VOC2012/images/124.png  
  inflating: VOC2012/images/392.png  
  inflating: VOC2012/images/130.png  
  inflating: VOC2012/images/339.png  
  inflating: VOC2012/images/10.png   
  inflating: VOC2012/images/88.png   
  inflating: VOC2012/images/66.png   
  inflating: VOC2012/images/299.png  
  inflating: VOC2012/images/61.png   
  inflating: VOC2012/images/336.png  
  inflating: VOC2012/images/533.png  
  inflating: VOC2012/images/141.png  
  inflating: VOC2012/images/690.png  
  inflating: VOC2012/images/305.png  
  inflating: VOC2012/images/348.png  
  inflating: VO

In [None]:
from typing import Optional, Union, Tuple, List, Callable, Dict
import torch
from diffusers import StableDiffusionXLPipeline, DDIMScheduler, AutoencoderKL, DiffusionPipeline
import numpy as np
import abc

# Notebook-wide switches and hyper-parameters.
# LOW_RESOURCE is read by AttentionControl and forwarded as `low_resource` by generate_att.
LOW_RESOURCE = False
# Not referenced in the visible cells.
NUM_DIFFUSION_STEPS = 50
# Forwarded as `guidance_scale` by generate_att.
GUIDANCE_SCALE = 7.5
# Not referenced in the visible cells. NOTE(review): presumably the text-encoder token limit — confirm.
MAX_NUM_WORDS = 77

# Attention controllers: hooks that intercept and store attention maps
class AttentionControl(abc.ABC):
    """Base class for hooks that receive every attention map of a UNet pass.

    An instance is called once per patched attention layer. It keeps two counters:
    ``cur_att_layer`` (layers seen in the current step) and ``cur_step`` (completed
    steps). ``num_att_layers`` starts at -1 and is expected to be set by whoever
    registers the controller.
    """

    def __init__(self):
        self.cur_step = 0
        self.num_att_layers = -1
        self.cur_att_layer = 0

    def reset(self):
        """Rewind the step and layer counters to zero."""
        self.cur_step = 0
        self.cur_att_layer = 0

    @property
    def num_uncond_att_layers(self):
        """Number of leading layer calls to skip: all of them in LOW_RESOURCE mode, else none."""
        if LOW_RESOURCE:
            return self.num_att_layers
        return 0

    def step_callback(self, x_t):
        """Hook applied to the latents after a step; the base class returns them unchanged."""
        return x_t

    def between_steps(self):
        """Hook invoked once all layers of a step have been seen; no-op by default."""
        return

    @abc.abstractmethod
    def forward(self, attn, is_cross: bool, place_in_unet: str):
        """Process one attention map and return the (possibly edited) map."""
        raise NotImplementedError

    def __call__(self, attn, is_cross: bool, place_in_unet: str):
        """Route one attention map through ``forward`` and advance the counters."""
        past_skipped_layers = self.cur_att_layer >= self.num_uncond_att_layers
        if past_skipped_layers:
            if LOW_RESOURCE:
                attn = self.forward(attn, is_cross, place_in_unet)
            else:
                # Only the second half of the leading dimension is handed to forward();
                # the first half passes through untouched.
                half = attn.shape[0] // 2
                attn[half:] = self.forward(attn[half:], is_cross, place_in_unet)

        self.cur_att_layer += 1
        layers_per_step = self.num_att_layers + self.num_uncond_att_layers
        if self.cur_att_layer == layers_per_step:
            # Every layer of this step has reported: close the step.
            self.cur_att_layer = 0
            self.cur_step += 1
            self.between_steps()
        return attn

class EmptyControl(AttentionControl):
    """Controller that leaves every attention map untouched."""

    def forward(self, attn, is_cross: bool, place_in_unet: str):
        # Identity hook: hand the map back exactly as it was received.
        return attn


class AttentionStore(AttentionControl):
    """Controller that records attention maps and accumulates them across steps.

    ``step_store`` collects the maps of the step in progress; ``attention_store``
    holds the running element-wise sum over completed steps. Both are dicts keyed
    by ``"<place>_<cross|self>"``.
    """

    def __init__(self):
        super().__init__()
        self.step_store = self.get_empty_store()
        self.attention_store = {}

    def reset(self):
        """Reset the counters and drop everything recorded so far."""
        super().reset()
        self.step_store = self.get_empty_store()
        self.attention_store = {}

    @staticmethod
    def get_empty_store():
        """Return a fresh dict with one empty list per (place, attention kind)."""
        store_keys = ("down_cross", "mid_cross", "up_cross",
                      "down_self", "mid_self", "up_self")
        return {store_key: [] for store_key in store_keys}

    def forward(self, attn, is_cross: bool, place_in_unet: str):
        """Record ``attn`` for the current step and return it unchanged."""
        if attn.shape[1] > 64 ** 2:  # avoid memory overhead: larger maps are not kept
            return attn
        kind = 'cross' if is_cross else 'self'
        self.step_store[f"{place_in_unet}_{kind}"].append(attn)
        return attn

    def between_steps(self):
        """Fold the finished step into the running totals, then start a new step store."""
        if not self.attention_store:
            # First completed step: adopt its maps as the running totals.
            self.attention_store = self.step_store
        else:
            for key, totals in self.attention_store.items():
                fresh = self.step_store[key]
                for idx in range(len(totals)):
                    totals[idx] += fresh[idx]
        self.step_store = self.get_empty_store()

    def get_average_attention(self):
        """Return the accumulated maps divided by the number of completed steps."""
        averaged = {}
        for key, totals in self.attention_store.items():
            averaged[key] = [total / self.cur_step for total in totals]
        return averaged

  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


In [None]:
from PIL import Image
import cv2

## Visualization code utils
def view_images(images, num_rows=1, offset_ratio=0.02):
    """Tile images into a ``num_rows``-row grid with white gutters and display it.

    Args:
        images: a list of HxWx3 arrays, a 4-D array (N, H, W, 3), or a single
            HxWx3 array.
        num_rows: number of grid rows.
        offset_ratio: gutter width as a fraction of the image height.

    The grid is padded with blank white tiles so the image count becomes a
    multiple of ``num_rows``. The result is shown with ``display``; nothing is
    returned.
    """
    if type(images) is list:
        num_images = len(images)
    elif images.ndim == 4:
        num_images = images.shape[0]
    else:
        images = [images]
        num_images = 1

    # Tiles needed to complete the last column. The previous `count % num_rows`
    # under-padded whenever num_rows >= 3 (e.g. 4 images in 3 rows), which
    # silently dropped images from the grid.
    num_empty = (-num_images) % num_rows

    blank_tile = np.ones(images[0].shape, dtype=np.uint8) * 255
    images = [image.astype(np.uint8) for image in images] + [blank_tile] * num_empty
    num_items = len(images)

    h, w, _ = images[0].shape
    offset = int(h * offset_ratio)
    num_cols = num_items // num_rows
    image_ = np.ones((h * num_rows + offset * (num_rows - 1),
                      w * num_cols + offset * (num_cols - 1), 3), dtype=np.uint8) * 255
    for i in range(num_rows):
        top = i * (h + offset)
        for j in range(num_cols):
            left = j * (w + offset)
            image_[top: top + h, left: left + w] = images[i * num_cols + j]

    pil_img = Image.fromarray(image_)
    display(pil_img)


def text_under_image(image: np.ndarray, text: str, text_color: Tuple[int, int, int] = (0, 0, 0)):
    """Return a copy of ``image`` with a white caption band holding ``text`` below it.

    The band is 20% of the image height; the text is centred horizontally and
    drawn with OpenCV's Hershey Simplex font (scale 1, thickness 2).
    """
    height, width, channels = image.shape
    band = int(height * .2)
    font = cv2.FONT_HERSHEY_SIMPLEX

    # White canvas tall enough for the picture plus the caption band.
    canvas = np.ones((height + band, width, channels), dtype=np.uint8) * 255
    canvas[:height] = image

    text_width, text_height = cv2.getTextSize(text, font, 1, 2)[0]
    origin = ((width - text_width) // 2, height + band - text_height // 2)
    cv2.putText(canvas, text, origin, font, 1, text_color, 2)
    return canvas

In [None]:
# code for aggregating attention
@torch.no_grad()
def aggregate_all_attention(prompts, attention_store: AttentionStore, from_where: List[str], is_cross: bool, select: int):
    """Average the stored attention maps separately for each spatial resolution.

    Maps are taken from the ``from_where`` places of ``attention_store`` (cross or
    self, per ``is_cross``), bucketed by side length 8, 16, 32 or 64 according to
    their second dimension, and the ``select``-th prompt is picked from each.

    Returns:
        A list of CPU tensors, one per non-empty bucket, in ascending resolution
        order. Empty buckets are skipped, so the list index is not the resolution.
    """
    kind = 'cross' if is_cross else 'self'
    averaged = attention_store.get_average_attention()

    # One bucket per supported side length; insertion order fixes the output order.
    buckets = {8: [], 16: [], 32: [], 64: []}
    for location in from_where:
        for item in averaged[f"{location}_{kind}"]:
            for side, bucket in buckets.items():
                if item.shape[1] == side * side:
                    per_prompt = item.reshape(len(prompts), -1, side, side, item.shape[-1])
                    bucket.append(per_prompt[select])

    # Original author's note on bucket sizes (8/16/32/64):
    # base and turbo: 0, 60, 10, 0, refiner: 4, 20, 20, 0
    results = []
    for bucket in buckets.values():
        if len(bucket) == 0:
            continue
        merged = torch.cat(bucket, dim=0)
        merged = merged.sum(0) / merged.shape[0]
        results.append(merged.cpu())
    return results


@torch.no_grad()
def aggregate_attention(prompts, attention_store: AttentionStore, res: int, from_where: List[str], is_cross: bool, select: int):
    """Average the stored attention maps of one resolution for the selected prompt.

    Only maps whose second dimension equals ``res ** 2`` are used. Returns a CPU
    tensor of shape (res, res, last_dim).
    """
    kind = 'cross' if is_cross else 'self'
    averaged = attention_store.get_average_attention()
    wanted_pixels = res ** 2

    selected = [
        item.reshape(len(prompts), -1, res, res, item.shape[-1])[select]
        for location in from_where
        for item in averaged[f"{location}_{kind}"]
        if item.shape[1] == wanted_pixels
    ]
    merged = torch.cat(selected, dim=0)
    merged = merged.sum(0) / merged.shape[0]
    return merged.cpu()


# visualize cross att
def show_cross_attention(prompts, tokenizer, attention_store: AttentionStore, res: int, from_where: List[str], select: int = 0):
    """Display one captioned cross-attention heat map per prompt token.

    The maps come from ``aggregate_attention`` at resolution ``res``; each is
    scaled to 0..255, resized to 256x256 and captioned with a decoded token.
    """
    token_ids = tokenizer.encode(prompts[select])
    decode = tokenizer.decode
    attention_maps = aggregate_attention(prompts, attention_store, res, from_where, True, select)

    def _render_tile(token_index):
        # Scale the map of one token to 0..255 and turn it into a 256x256 RGB array.
        heat = attention_maps[:, :, token_index]
        heat = 255 * heat / heat.max()
        heat = heat.unsqueeze(-1).expand(*heat.shape, 3)
        pixels = heat.float().numpy().astype(np.uint8)
        return np.array(Image.fromarray(pixels).resize((256, 256)))

    tiles = []
    label_index = 0
    for map_index in range(len(token_ids)):
        tile = _render_tile(map_index)
        # The "++" marker is never used as a caption: jump to the next token.
        # NOTE(review): map_index is not advanced with it, so captions after "++"
        # come from one token later than the map they label — confirm this is intended.
        label = decode(int(token_ids[label_index]))
        if label == "++":
            label_index += 1
            label = decode(int(token_ids[label_index]))
        tiles.append(text_under_image(tile, label))
        label_index += 1
        if label_index >= len(token_ids):
            break
    view_images(np.stack(tiles, axis=0))


# visualize self att
def show_self_attention_comp(prompts, attention_store: AttentionStore, res: int, from_where: List[str],
                        max_com=10, select: int = 0):
    """Display the leading SVD components of the aggregated self-attention.

    The (res**2, res**2) self-attention matrix is row-centred and decomposed with
    ``np.linalg.svd``; the first ``max_com`` rows of ``vh`` are shown side by side
    as 256x256 grayscale tiles.
    """
    num_pixels = res ** 2
    self_attention = aggregate_attention(prompts, attention_store, res, from_where, False, select)
    self_attention = self_attention.float().numpy().reshape((num_pixels, num_pixels))

    centred = self_attention - np.mean(self_attention, axis=1, keepdims=True)
    _, _, vh = np.linalg.svd(centred)

    tiles = []
    for component_index in range(max_com):
        component = vh[component_index].reshape(res, res)
        # Shift to start at zero, then stretch to the 0..255 range.
        component = component - component.min()
        component = 255 * component / component.max()
        rgb = np.repeat(np.expand_dims(component, axis=2), 3, axis=2).astype(np.uint8)
        tiles.append(np.array(Image.fromarray(rgb).resize((256, 256))))
    view_images(np.concatenate(tiles, axis=1))

In [None]:
def encode_imgs(imgs, vae, scaling_factor=0.18215):
    """Encode images into scaled VAE latents.

    Args:
        imgs: image batch; the code treats it as [B, 3, H, W] and maps it with
            ``2 * imgs - 1`` before encoding (so inputs in [0, 1] become [-1, 1]).
        vae: object whose ``encode(x).latent_dist.mean`` yields the posterior mean.
        scaling_factor: multiplier applied to the posterior mean. The default keeps
            the previously hard-coded value.
            NOTE(review): diffusers VAEs expose their own factor as
            ``vae.config.scaling_factor`` — confirm 0.18215 is intended for the VAE used here.

    Returns:
        The posterior mean multiplied by ``scaling_factor``.
    """
    imgs = 2 * imgs - 1
    # Encode once: the earlier version ran the encoder twice and discarded the first result.
    posterior_mean = vae.encode(imgs).latent_dist.mean
    return posterior_mean * scaling_factor


# fix random seed
def same_seeds(seed):
    """Seed torch (CPU and, when available, CUDA) and NumPy, and make cuDNN deterministic."""
    # cuDNN: turn off the benchmark mode and request deterministic algorithms.
    # (The original comment notes benchmark may be set to True when the GPU and
    # network structure are fixed.)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # NumPy's global generator, so later np.random calls are repeatable.
    np.random.seed(seed)

    # torch CPU generator.
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    # torch CUDA generators: the current device, then every device.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


# cam visual_code
def show_cam_on_image(img, mask):
    """Overlay ``mask`` as a JET heat map on ``img`` and return a uint8 array.

    ``mask`` is bilinearly resized to (img.size[1], img.size[0]), colour-mapped,
    added to the image (both scaled to 0..1), and the sum is renormalised by its
    maximum before conversion back to 0..255.
    """
    target_size = (img.size[1], img.size[0])
    resized = F.interpolate(mask.unsqueeze(0).unsqueeze(0), size=target_size, mode='bilinear', align_corners=False)
    mask = resized.squeeze().squeeze()

    heatmap = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
    overlay = np.float32(heatmap) / 255 + np.float32(img) / 255.
    overlay = overlay / np.max(overlay)
    return np.uint8(255 * overlay)

In [None]:
## ptp utils function
from einops import rearrange

def init_latent(latent, model, height, width, generator, batch_size):
    """Return the base latent and a copy of it expanded to ``batch_size``.

    Args:
        latent: starting latent, or None to sample one with ``torch.randn``.
        model: pipeline exposing ``unet.config.in_channels`` and ``device``.
        height, width: image size in pixels; the latent grid is 1/8 of it per side.
        generator: ``torch.Generator`` used only when ``latent`` is None.
        batch_size: leading dimension of the expanded latents.

    Returns:
        ``(latent, latents)``: the single latent, and the same latent expanded to
        the batch size and moved to ``model.device``.
    """
    # Read the channel count from the UNet config in both places; the earlier
    # code mixed `unet.config.in_channels` with the direct `unet.in_channels`.
    channels = model.unet.config.in_channels
    grid = (height // 8, width // 8)
    if latent is None:
        latent = torch.randn((1, channels, *grid), generator=generator)
    latents = latent.expand(batch_size, channels, *grid).to(model.device)
    return latent, latents

@torch.no_grad()
def register_attention_control(model, controller):
    """Patch the UNet's attention layers so their attention maps pass through ``controller``.

    Every submodule whose class is named ``Attention``, found under the children
    of ``model.unet`` whose names contain "down", "up" or "mid", gets its
    ``forward`` replaced. The replacement computes attention from
    ``to_q``/``to_k``/``to_v``, hands the softmaxed map to ``controller`` and
    projects the result with ``to_out`` (its first element when it is a
    ModuleList). Finally ``controller.num_att_layers`` is set to the number of
    patched layers. A ``None`` controller is replaced by a pass-through.
    """

    class DummyController:
        """Pass-through used when no controller is supplied."""

        def __init__(self):
            self.num_att_layers = 0

        def __call__(self, *args):
            return args[0]

    if controller is None:
        controller = DummyController()

    def ca_forward(self, place_in_unet):
        # Only the first entry of a ModuleList `to_out` is applied.
        if type(self.to_out) is torch.nn.modules.container.ModuleList:
            to_out = self.to_out[0]
        else:
            to_out = self.to_out

        def forward(hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
            # Original author's note: hidden_states is nan (because the vae of SDXL
            # does not support f16 precision).
            # Shapes logged by the original author for one layer:
            # x, k and v were [2, 1024, 640]; the to_q weight was [640, 640].
            batch_size = len(hidden_states)
            heads = self.heads

            # Cross attention when a context is supplied, self attention otherwise.
            is_cross = encoder_hidden_states is not None
            context = encoder_hidden_states if is_cross else hidden_states

            query = self.to_q(hidden_states)
            key = self.to_k(context)
            value = self.to_v(context)
            query, key, value = (
                rearrange(t, 'b n (h d) -> (b h) n d', h=heads) for t in (query, key, value)
            )
            sim = torch.einsum("b i d, b j d -> b i j", query, key) * self.scale

            if attention_mask is not None:
                mask = attention_mask.reshape(batch_size, -1)
                max_neg_value = -torch.finfo(sim.dtype).max
                mask = mask[:, None, :].repeat(heads, 1, 1)
                sim.masked_fill_(~mask, max_neg_value)

            attn = sim.softmax(dim=-1)
            # The controller sees (and may replace) the attention map here.
            attn = controller(attn, is_cross, place_in_unet)

            out = torch.einsum("b i j, b j d -> b i d", attn, value)
            out = rearrange(out, '(b h) n d -> b n (h d)', h=heads)
            return to_out(out)

        return forward

    def register_recr(module, count, place_in_unet):
        # Depth-first walk: patch Attention modules, recurse into everything else.
        if module.__class__.__name__ == 'Attention':
            module.forward = ca_forward(module, place_in_unet)
            return count + 1
        elif hasattr(module, 'children'):
            for child in module.children():
                count = register_recr(child, count, place_in_unet)
        return count

    patched_layers = 0
    for child_name, child in model.unet.named_children():
        # First matching tag wins, in the order down, up, mid.
        for place in ("down", "up", "mid"):
            if place in child_name:
                patched_layers += register_recr(child, 0, place)
                break

    controller.num_att_layers = patched_layers


@torch.no_grad()
def diffusion_step(model, controller, latents, context, t, guidance_scale, low_resource=False, height=None, width=None, base=True):
    """Run the UNet once on negative and positive conditioning and return its raw output.

    Args:
        model: pipeline providing ``unet``, ``device``, ``text_encoder_2`` and
            ``_get_add_time_ids``.
        controller: accepted for interface compatibility; not used in this function.
        latents: latents fed to the UNet.
        context: ``[prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds,
            negative_pooled_prompt_embeds]``.
        t: timestep passed to the UNet.
        guidance_scale: accepted for interface compatibility; no guidance mixing
            is applied here.
        low_resource: when True, run two separate UNet passes (negative first,
            then positive) instead of one batched pass.
        height, width: used for the original/target size time ids.
        base: True calls ``_get_add_time_ids`` with the base signature; False
            uses the signature that also takes aesthetic scores and negative
            sizes and returns a (positive, negative) pair.

    Returns:
        The UNet ``"sample"`` output with the negative-prompt prediction first and
        the prompt prediction second along the batch dimension.

    The ``low_resource`` path previously never assigned ``noise_pred``, read
    ``context`` in the wrong order and omitted ``added_cond_kwargs``; both paths
    now share the same conditioning and return the same layout.
    """
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds
    ) = context

    # Added time ids: original size, crop offset and target size.
    original_size = (height, width)
    crops_coords_top_left = (0, 0)
    target_size = (height, width)
    projection_dim = model.text_encoder_2.config.projection_dim
    if base:
        add_time_ids = model._get_add_time_ids(
            original_size,
            crops_coords_top_left,
            target_size,
            dtype=prompt_embeds.dtype,
            text_encoder_projection_dim=projection_dim,
        )
        negative_add_time_ids = add_time_ids
    else:
        add_time_ids, negative_add_time_ids = model._get_add_time_ids(
            original_size,
            crops_coords_top_left,
            target_size,
            6.0,  # aesthetic_score default
            2.5,  # negative_aesthetic_score default
            original_size,  # negative_original_size
            crops_coords_top_left,  # negative_crops_coords_top_left
            target_size,  # negative_target_size
            dtype=prompt_embeds.dtype,
            text_encoder_projection_dim=projection_dim,
        )

    device = model.device
    # Each pair is ordered (negative, positive): the batched path concatenates
    # them in that order and the low-resource path runs them in that order.
    embeds = (negative_prompt_embeds.to(device), prompt_embeds.to(device))
    pooled = (negative_pooled_prompt_embeds.to(device), pooled_prompt_embeds.to(device))
    time_ids = (negative_add_time_ids.to(device), add_time_ids.to(device))

    if low_resource:
        passes = []
        for text_embeds, text_pooled, text_time_ids in zip(embeds, pooled, time_ids):
            passes.append(
                model.unet(
                    latents,
                    t,
                    encoder_hidden_states=text_embeds,
                    timestep_cond=None,
                    added_cond_kwargs={"text_embeds": text_pooled, "time_ids": text_time_ids},
                )["sample"]
            )
        noise_pred = torch.cat(passes, dim=0)
    else:
        noise_pred = model.unet(
            torch.cat([latents] * 2),
            t,
            encoder_hidden_states=torch.cat(embeds, dim=0),
            timestep_cond=None,
            added_cond_kwargs={
                "text_embeds": torch.cat(pooled, dim=0),
                "time_ids": torch.cat(time_ids, dim=0),
            },
        )["sample"]

    return noise_pred


## text to image custom pipeline
@torch.no_grad()
def text2image_ldm_stable(
    model,
    prompt: List[str],
    controller,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    generator: Optional[torch.Generator] = None,
    latent: Optional[torch.FloatTensor] = None,
    low_resource: bool = False,
    noise_sample_num=1,
    height: int = 512,
    width: int = 512,
    base: bool = True
):
    """Run one attention-recording UNet pass for ``prompt``; no image is produced.

    Steps: register ``controller`` on the UNet's attention layers, encode the
    prompt and an empty negative prompt with the pipeline's text encoder(s),
    prepare the latent, then call ``diffusion_step`` once. The controller is
    the only carrier of results.

    Args:
        model: pipeline with ``tokenizer``/``tokenizer_2``,
            ``text_encoder``/``text_encoder_2``, ``scheduler``, ``unet`` and
            ``device``; ``tokenizer``/``text_encoder`` may be None.
        prompt: list of prompt strings; its length is the batch size.
        controller: attention controller to register.
        num_inference_steps: passed to ``scheduler.set_timesteps`` and also used
            as the timestep of the single ``diffusion_step`` call.
        guidance_scale, low_resource, base: forwarded to ``diffusion_step``.
        generator, latent: forwarded to ``init_latent``.
        noise_sample_num: accepted but not used.
        height, width: image size; a falsy value falls back to
            ``model.default_sample_size * model.vae_scale_factor``.

    Returns:
        ``(None, None)``.
    """
    register_attention_control(model, controller)
    height = height or model.default_sample_size * model.vae_scale_factor
    width = width or model.default_sample_size * model.vae_scale_factor
    batch_size = len(prompt)

    # Use both tokenizer / text-encoder pairs when the first one exists, else only the second.
    if model.tokenizer is not None:
        tokenizers = [model.tokenizer, model.tokenizer_2]
        max_length = model.tokenizer.model_max_length
    else:
        tokenizers = [model.tokenizer_2]
        max_length = model.tokenizer_2.model_max_length
    text_encoders = [model.text_encoder, model.text_encoder_2] if model.text_encoder is not None else [model.text_encoder_2]

    def _encode(texts, length):
        # Returns the encoders' second-to-last hidden states concatenated on the
        # feature axis, plus element 0 of the LAST encoder's output (used as the pooled embedding).
        hidden_states = []
        pooled = None
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            token_ids = tokenizer(
                texts,
                padding="max_length",
                max_length=length,
                truncation=True,
                return_tensors="pt",
            ).input_ids
            encoded = text_encoder(token_ids.to(model.device), output_hidden_states=True)
            pooled = encoded[0]
            hidden_states.append(encoded.hidden_states[-2])
        return torch.concat(hidden_states, dim=-1), pooled

    # The earlier version also tokenised every prompt a second time without
    # truncation and never used the result; that dead pass is gone, and the
    # `prompt` argument is no longer shadowed by a loop variable.
    prompt_embeds, pooled_prompt_embeds = _encode(prompt, max_length)
    # The negative prompt is padded to the sequence length of the positive embeddings.
    negative_prompt_embeds, negative_pooled_prompt_embeds = _encode([""] * batch_size, prompt_embeds.shape[1])

    prompt_embeds = prompt_embeds.to(dtype=model.text_encoder_2.dtype, device=model.device)
    negative_prompt_embeds = negative_prompt_embeds.to(dtype=model.text_encoder_2.dtype, device=model.device)

    context = [prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds]

    # Prepare timesteps.
    model.scheduler.set_timesteps(num_inference_steps)
    # Prepare latent variables: the batch-expanded copy from init_latent is not
    # used; the single latent is fed to the UNet.
    latent, _ = init_latent(latent, model, height, width, generator, batch_size)
    latents = latent.squeeze(1).to(model.device)

    # One UNet pass; its output is discarded because only the attention maps matter.
    diffusion_step(model, controller, latents, context, num_inference_steps, guidance_scale, low_resource, height, width, base)

    return None, None

In [None]:
import torch.nn.functional as F

# main function
# Stable diffusion
def generate_att(t, ldm_stable, input_latent, noise, prompts, controller, pos_positions, device,
                 is_self=True, is_multi_self=False, is_cross_norm=True, weight=[0.8, 0.2], height=None,
                 width=None, verbose=False, base=True, neg_positions=[], neg_weight=1.0, alpha=8, beta=0.4):
    """Build a 512x512 attention mask in [0, 1] for the target tokens of a prompt.

    Args:
        t: noise timestep; converted to int, used for ``scheduler.add_noise`` and
            passed on as ``num_inference_steps``.
        ldm_stable: diffusion pipeline.
        input_latent, noise: clean latent and the noise to add to it.
        prompts: list of prompts; attention is read for prompt 0.
        controller: attention store; reset at the start and filled by the UNet pass.
        pos_positions: token index lists whose cross-attention maps are averaged.
        device: device for the timestep tensor.
        is_self: refine the fused map with the highest-resolution self-attention
            (only when ``is_multi_self`` is False).
        is_multi_self: refine each resolution with its own self-attention instead.
        is_cross_norm: min-max normalise every token map per resolution.
        weight: one weight per resolution (two for base, three otherwise);
            neither this list nor ``neg_positions`` is mutated.
        height, width, base: forwarded to ``text2image_ldm_stable``; ``base``
            also selects the resolutions ([16, 32] or [8, 16, 32]).
        verbose: display the mask and the per-resolution cross-attention maps.
        neg_positions, neg_weight: token index lists whose averaged map is
            subtracted, scaled by ``neg_weight``.
        alpha, beta: the mask is passed through ``sigmoid(alpha * (x - beta))``.

    Returns:
        A (512, 512) tensor, min-max normalised to [0, 1].
    """
    controller.reset()
    # torch.Generator's first argument is a device, not a seed, so the earlier
    # `torch.Generator(4307)` never seeded anything; seed a CPU generator explicitly.
    g_cpu = torch.Generator().manual_seed(4307)
    t = int(t)
    latents_noisy = ldm_stable.scheduler.add_noise(input_latent, noise, torch.tensor(t, device=device))
    # Only the side effect matters: the pass fills `controller` with attention maps.
    text2image_ldm_stable(ldm_stable, prompts, controller, latent=latents_noisy, num_inference_steps=t, guidance_scale=GUIDANCE_SCALE, generator=g_cpu, low_resource=LOW_RESOURCE, height=height, width=width, base=base)
    layers = ("mid", "up", "down")
    # Author's shape notes. Cross attention: [16, 16, 77] and [32, 32, 77].
    # Self attention: [16, 16, 256] and [32, 32, 1024]; the refiner adds [8, 8, 64].
    cross_attention_maps = aggregate_all_attention(prompts, controller, layers, True, 0)
    self_attention_maps = aggregate_all_attention(prompts, controller, ("up", "mid", "down"), False, 0)

    # Resolutions expected from the aggregation, lowest first. The lists above are
    # indexed by position, so this assumes exactly these resolutions were recorded.
    resolution_range = [16, 32] if base else [8, 16, 32]
    final_res = resolution_range[-1]

    weighted_maps = []
    for idx, res in enumerate(resolution_range):
        out_att = cross_attention_maps[idx].permute(2, 0, 1).float()
        if is_cross_norm:
            att_max = torch.amax(out_att, dim=(1, 2), keepdim=True)
            att_min = torch.amin(out_att, dim=(1, 2), keepdim=True)
            out_att = (out_att - att_min) / (att_max - att_min)
        if is_multi_self:
            self_att = self_attention_maps[idx].view(res * res, res * res).float()
            self_att = self_att / self_att.max()
            out_att = torch.matmul(self_att.unsqueeze(0), out_att.view(-1, res * res, 1)).view(-1, res, res)
        if res != final_res:
            # Bring lower resolutions up to the final one before mixing.
            out_att = F.interpolate(out_att.unsqueeze(0), size=(final_res, final_res), mode='bilinear', align_corners=False).squeeze()
        weighted_maps.append(out_att * weight[idx])

    # Weighted sum over resolutions, computed once instead of once per token position.
    fused = torch.stack(weighted_maps).sum(0)

    def _token_map(position):
        # Mean map of the given token index list, flattened to a column vector.
        return fused[position].mean(0).view(final_res * final_res, 1)

    # Aggregated cross-attention map of the positive tokens.
    cross_att_map = _token_map(pos_positions[0])
    for pos in pos_positions[1:]:
        cross_att_map += _token_map(pos)
    if len(pos_positions) > 1:
        cross_att_map /= len(pos_positions)

    if len(neg_positions) > 0:
        cross_att_map_neg = torch.zeros_like(cross_att_map)
        for pos in neg_positions:
            cross_att_map_neg += _token_map(pos)
        cross_att_map_neg /= len(neg_positions)
        cross_att_map -= neg_weight * cross_att_map_neg

    # Refine the cross-attention map with the highest-resolution self-attention.
    if is_self and not is_multi_self:
        self_att = self_attention_maps[-1].view(final_res * final_res, final_res * final_res).float()
        self_att = self_att / self_att.max()
        cross_att_map = torch.matmul(self_att, cross_att_map)

    # Use the final resolution explicitly rather than the leaked loop variable.
    att_map = cross_att_map.view(final_res, final_res)
    att_map = F.interpolate(att_map.unsqueeze(0).unsqueeze(0), size=(512, 512), mode='bilinear', align_corners=False).squeeze().squeeze()
    att_map = (att_map - att_map.min()) / (att_map.max() - att_map.min())
    att_map = torch.sigmoid(alpha * (att_map - beta))
    att_map = (att_map - att_map.min()) / (att_map.max() - att_map.min())
    if verbose:
        att_map_map = Image.fromarray((att_map.cpu().detach().numpy() * 255).astype(np.uint8), mode="L")
        display(att_map_map)
        tokenizer = ldm_stable.tokenizer if ldm_stable.tokenizer is not None else ldm_stable.tokenizer_2
        for shown_res in resolution_range:
            print("{}x{} cross att map".format(shown_res, shown_res))
            show_cross_attention(prompts, tokenizer, controller, res=shown_res, from_where=layers)

    return att_map

In [None]:
from torchvision import transforms

def stable_diffusion_inference(img_path, cls_name, device, blip_device, processor, model, ldm_stable, verbose=False,
                               weight=[0.8, 0.2], t=100, base=True, prompt=None, seed=3407, negative_token=False,
                               neg_weight=1.0, alpha=8, beta=0.4):
  """Compute an attention mask for ``cls_name`` on one image.

  Args:
    img_path: path to the target image.
    cls_name: target class word inserted into the prompt.
    device: device of the diffusion pipeline.
    blip_device: device of the captioning model.
    processor: captioning processor; called on (image, text) and used to decode.
    model: captioning model; ``generate`` is called on the processor output.
    ldm_stable: diffusion pipeline (its ``vae`` encodes the image).
    verbose: display the mask overlay and intermediate attention maps.
    weight: per-resolution cross-attention weights, forwarded to ``generate_att``.
    t: noise timestep, forwarded to ``generate_att``.
    base: forwarded to ``generate_att``.
    prompt: ready-made prompt; when None one is built as
      ``"a photograph of <cls_name>" + "++" + <caption continuation>``.
    seed: seed passed to ``same_seeds``.
    negative_token: also collect noun tokens after "++" (other than ``cls_name``)
      as negative positions.
    neg_weight, alpha, beta: forwarded to ``generate_att``.

  Returns:
    ``(mask, texts)``: the mask resized to the original image size, and the prompt used.
  """
  with torch.no_grad():
    same_seeds(seed)

    input_img = Image.open(img_path).convert("RGB")

    to_tensor = transforms.Compose([transforms.ToTensor()])
    img_tensor = (to_tensor(input_img).unsqueeze(0)).to(device)
    rgb_512 = F.interpolate(img_tensor, (512, 512), mode='bilinear', align_corners=False).bfloat16()

    vae = ldm_stable.vae
    input_latent = encode_imgs(rgb_512, vae)
    noise = torch.randn_like(input_latent).to(device)
    raw_image = input_img
    if prompt is None:
      text = f"a photograph of {cls_name}"
      inputs = processor(raw_image, text, return_tensors="pt").to(blip_device)

      # Caption the image conditioned on `text`, then put "++" between the class
      # phrase and the generated continuation.
      out = model.generate(**inputs)
      texts = processor.decode(out[0], skip_special_tokens=True)
      texts = text + "++" + texts[len(text):]
    else:
      texts = prompt

    prompts = [texts]
    # Fall back to tokenizer_2 when the pipeline has no first tokenizer, as
    # generate_att and text2image_ldm_stable already do; the earlier code
    # dereferenced ldm_stable.tokenizer unconditionally.
    tokenizer = ldm_stable.tokenizer if ldm_stable.tokenizer is not None else ldm_stable.tokenizer_2
    token_ids = tokenizer.encode(texts)
    tokens = [tokenizer.decode(int(_)) for _ in token_ids]
    tagged_tokens = nltk.tag.pos_tag(tokens)

    pos_positions = []  # token indices of the target class words
    neg_positions = []
    pos_start = False
    neg_start_pos = -1
    # Positive tokens are those after the word 'of' and before the '++' marker.
    for i, (word, tag) in enumerate(tagged_tokens):
      if word == 'of':
        pos_start = True
        continue
      if word == '++':
        neg_start_pos = i + 1
        break
      if pos_start:
        pos_positions.append([i])
    if negative_token:
      # Negative tokens: nouns after '++' (the last token excluded) that differ from cls_name.
      # NOTE(review): the `- 1` stores each index one token earlier than its position in
      # tagged_tokens, whereas positive positions use the raw index — confirm this offset.
      for i, (word, tag) in enumerate(tagged_tokens[neg_start_pos:-1]):
        if tag.startswith('N'):
          if word != cls_name:
            neg_positions.append([i + neg_start_pos - 1])

    controller = AttentionStore()
    height = 512
    width = 512
    mask = generate_att(t, ldm_stable, input_latent, noise, prompts, controller, pos_positions, device,
                        is_self=True, is_multi_self=False, is_cross_norm=True, weight=weight, height=height,
                        width=width, verbose=verbose, base=base, neg_positions=neg_positions,
                        neg_weight=neg_weight, alpha=alpha, beta=beta)
    # Resize the 512x512 mask back to the original image size (PIL size is (width, height)).
    mask = F.interpolate(mask.unsqueeze(0).unsqueeze(0), size=(raw_image.size[1], raw_image.size[0]), mode='bilinear', align_corners=False).squeeze().squeeze()
    if verbose:
      cam = show_cam_on_image(raw_image, mask)
      print("visual_cam")
      display(Image.fromarray(cam[:, :, ::-1]))
    # Release the large intermediates before returning.
    del img_tensor
    del rgb_512
    del noise
    if prompt is None:
      del inputs
    torch.cuda.empty_cache()
    return mask, texts

In [None]:
import os
import shutil
import time

import json
import time, os, shutil

def domain_test(processor, model, ldm_stable, refiner, blip_device, device, images_dir, result_dir, label_map,
                augmented_label_file, base_weight, refiner_weight, base_t, refiner_t, alpha, augmented_label=False,
                thres_list=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], seed=3407, negative_token=False,
                neg_weight=1.0, a=8, b=0.4):
  """Predict and save a mask for every labelled class of every image in ``images_dir``.

  ``result_dir`` is deleted and recreated on every call. For each image/class pair the mask returned by
  ``stable_diffusion_inference`` is written to ``<result_dir>/mask/<stem>_<class>.npy`` and, for every
  threshold in ``thres_list``, a 0/255 PNG of ``mask > threshold`` is written to
  ``<result_dir>/<threshold>/<stem>_<class>.png``. ``<stem>`` is the file name up to its first dot.

  Args:
    processor, model, blip_device, device: forwarded unchanged to ``stable_diffusion_inference``.
    ldm_stable: pipeline used for the base pass (called with ``weight=base_weight``, ``t=base_t``).
    refiner: optional second pipeline (``weight=refiner_weight``, ``t=refiner_t``); skipped when ``None``.
    images_dir: directory that is scanned for ``.png`` / ``.tif`` / ``.jpg`` files.
    result_dir: output directory (wiped first).
    label_map: not used by this function; kept so existing callers keep working.
    augmented_label_file: file name that replaces the substring ``"images"`` in ``images_dir`` to locate
      the JSON label file. The JSON maps ``os.path.join(images_dir, file_name)`` to a dict of
      ``class name -> iterable of augmented class names``.
    alpha: blend factor, ``alpha * base_mask + (1 - alpha) * refiner_mask`` when a refiner is given.
    augmented_label: when true, masks are also predicted for every augmented class name.
    thres_list: binarisation thresholds; one output sub-directory is created per entry.
    seed, negative_token, neg_weight: forwarded to ``stable_diffusion_inference``.
    a, b: forwarded to ``stable_diffusion_inference`` as its ``alpha`` and ``beta`` arguments.

  Returns:
    None. Results are written to disk and a timing line is printed.
  """
  start = time.time()
  if os.path.isdir(result_dir):
    shutil.rmtree(result_dir)
  os.makedirs(os.path.join(result_dir, 'mask'))
  for thres in thres_list:
    os.makedirs(os.path.join(result_dir, '{}'.format(thres)), exist_ok=True)
  augmented_label_path = images_dir.replace("images", augmented_label_file)
  with open(augmented_label_path, 'r') as f:
    label_data = json.load(f)
  print('>>> seed: {}'.format(seed))

  def predict_mask(img_path, target_cls):
    # Base pass, optionally blended with a refiner pass that reuses the prompt returned by the base pass.
    mask, prompt = stable_diffusion_inference(img_path, target_cls, device, blip_device, processor, model,
                                              ldm_stable, verbose=False, weight=base_weight, t=base_t, base=True,
                                              seed=seed, negative_token=negative_token, neg_weight=neg_weight,
                                              alpha=a, beta=b)
    if refiner is not None:
      mask_refiner, prompt = stable_diffusion_inference(img_path, target_cls, device, blip_device, processor, model,
                                                        refiner, verbose=False, weight=refiner_weight, t=refiner_t,
                                                        base=False, prompt=prompt, seed=seed,
                                                        negative_token=negative_token, neg_weight=neg_weight,
                                                        alpha=a, beta=b)
      mask = alpha * mask + (1 - alpha) * mask_refiner
    return mask

  def save_mask(mask, stem, target_cls):
    # Raw mask as .npy plus one binarised PNG per threshold.
    with open(os.path.join(result_dir, 'mask', '{}_{}.npy'.format(stem, target_cls)), 'wb') as f:
      np.save(f, mask)
    for mask_threshold in thres_list:
      mask_binary = np.where(mask > mask_threshold, 255, 0)
      mask_binary_img = Image.fromarray(mask_binary.astype(np.uint8))
      mask_binary_img.save(os.path.join(result_dir, '{}'.format(mask_threshold), '{}_{}.png'.format(stem, target_cls)))

  size = 0
  for img_file in os.listdir(images_dir):
    if not img_file.endswith(('.png', '.tif', '.jpg')):
      continue
    img_path = os.path.join(images_dir, img_file)
    size += 1
    seg_classes = label_data[img_path]
    # Same stem rule as the evaluation code (text before the first dot) so the file names line up.
    stem = img_file.split('.')[0]

    for cls_name in seg_classes.keys():
      target_classes = [cls_name]
      if augmented_label:
        target_classes.extend(seg_classes[cls_name])
      for target_cls in target_classes:
        # Each name is run through inference itself. Previously the augmented names re-ran the base
        # class name, which only produced copies of the base mask under different file names.
        save_mask(predict_mask(img_path, target_cls), stem, target_cls)

  # Parent directory name of images_dir; tolerant of trailing slashes and of paths without a parent.
  ds_name = os.path.basename(os.path.dirname(os.path.normpath(images_dir)))
  print(">>>>>>>>>> dataset: {}, size: {}, test time: {:.2f}s".format(ds_name, size, time.time() - start))


In [None]:
from PIL import Image
import numpy as np
import os, time
from sklearn.metrics import f1_score, roc_curve, auc

def analysis(results_dir_list, segmentations_dir_list, augmented_label_file, thres_list=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]):
  """Score saved binary predictions against ground-truth class arrays.

  For every ``(results_dir, segmentations_dir)`` pair and every threshold, each labelled class of each
  ground-truth file is compared with the prediction PNGs ``<results_dir>/<thres>/<stem>_<name>.png`` for
  the class name and all of its augmented names; the candidate with the highest F1 (dice) supplies the
  dice / IoU / pixel-accuracy sample for that class. Per-domain means and AUCs over the thresholds are
  printed, then all domains are pooled for the returned summary.

  Sibling paths are derived by replacing the substring ``"segmentations"`` in ``segmentations_dir`` with
  ``"class_array"`` (ground-truth ``<stem>_<cls>.npy``), ``"images"`` (label-file keys) and
  ``augmented_label_file`` (the JSON label file).
  NOTE(review): the class arrays are presumably binary 0/1 masks, as ``f1_score`` is called with its
  default binary averaging -- confirm against the code that writes them.

  A class with no usable prediction (file missing or shape mismatch for every candidate name) is scored
  0 for all three metrics. Thresholds whose result directory is missing, or that yield no samples, are
  left out of the AUC computations instead of breaking them.

  Args:
    results_dir_list: prediction root directories, one per domain.
    segmentations_dir_list: ground-truth directories, aligned with ``results_dir_list``.
    augmented_label_file: file name of the JSON label file (see above).
    thres_list: thresholds to evaluate; used as the x-axis of the AUCs.

  Returns:
    ``(f1_auc, f1_optim, iou_auc, iou_optim, pixel_auc, pixel_optim)`` over all domains pooled, where
    ``*_optim`` is the best per-threshold mean.

  Raises:
    ValueError: if fewer than two thresholds produced any sample (an AUC needs two points).
  """
  start = time.time()

  # Samples pooled over all domains, keyed by threshold.
  iou_res_ = {thres: [] for thres in thres_list}
  pixel_acc_res_ = {thres: [] for thres in thres_list}
  f1_res_ = {thres: [] for thres in thres_list}

  for results_dir, segmentations_dir in zip(results_dir_list, segmentations_dir_list):
    cls_arr_dir = segmentations_dir.replace("segmentations", "class_array")
    images_dir = segmentations_dir.replace("segmentations", "images")
    augmented_label_path = segmentations_dir.replace("segmentations", augmented_label_file)
    print('>>>> ', results_dir)
    with open(augmented_label_path, 'r') as f:
      label_data = json.load(f)

    # Thresholds actually scored for this domain; keeps the AUC x/y arrays the same length.
    evaluated_thres = []
    iou_domain = []
    pixel_acc_domain = []
    f1_domain = []

    for thres in thres_list:
      predict_root_dir = os.path.join(results_dir, '{}'.format(thres))
      if not os.path.isdir(predict_root_dir):
        continue
      iou_res = []
      pixel_acc_res = []
      f1_res = []

      for seg_file in os.listdir(segmentations_dir):
        if not seg_file.endswith(('.png', '.tif', '.jpg')):
          continue
        img_path = os.path.join(images_dir, seg_file)
        seg_classes = label_data[img_path]
        stem = seg_file.split('.')[0]

        for cls_name in seg_classes.keys():
          all_classes = [cls_name] + seg_classes[cls_name]
          seg_cls_arr = np.load(os.path.join(cls_arr_dir, '{}_{}.npy'.format(stem, cls_name)))
          best = None  # (f1, iou, pixel_acc) of the best-F1 candidate seen so far
          for cls in all_classes:
            predict_path = os.path.join(predict_root_dir, '{}_{}.png'.format(stem, cls))
            if not os.path.isfile(predict_path):
              continue
            with Image.open(predict_path) as predict_img:
              predict_cls_arr = np.asarray(predict_img) / 255

            if predict_cls_arr.shape != seg_cls_arr.shape:
              print('>>>invalid prediction', predict_path, seg_cls_arr.shape, predict_cls_arr.shape)
              continue
            intersection = np.sum(predict_cls_arr * seg_cls_arr).astype(np.float32)
            union = np.sum(np.logical_or(predict_cls_arr, seg_cls_arr)).astype(np.float32)
            correct = np.sum(predict_cls_arr == seg_cls_arr).astype(np.float32)

            # Both masks empty -> 0 instead of NaN, matching f1_score's value for that case.
            iou_ = intersection / union if union > 0 else 0.0
            pixel_acc_ = correct / (seg_cls_arr.shape[0] * seg_cls_arr.shape[1])
            f1_ = f1_score(seg_cls_arr.flatten(), predict_cls_arr.flatten())

            if best is None or f1_ > best[0]:
              best = (f1_, iou_, pixel_acc_)

          if best is None:
            # Count a missing prediction as a complete miss rather than averaging in a -1 sentinel.
            print('>>>missing prediction', predict_root_dir, stem, cls_name)
            best = (0.0, 0.0, 0.0)
          f1_res.append(best[0])
          iou_res.append(best[1])
          pixel_acc_res.append(best[2])

      if not f1_res:
        print('>>>> thres: {}, no samples found, skipped'.format(thres))
        continue

      iou_res_[thres] += iou_res
      pixel_acc_res_[thres] += pixel_acc_res
      f1_res_[thres] += f1_res

      f1_mean = np.array(f1_res).mean()
      iou_mean = np.array(iou_res).mean()
      pixel_acc_mean = np.array(pixel_acc_res).mean()
      evaluated_thres.append(thres)
      iou_domain.append(iou_mean)
      pixel_acc_domain.append(pixel_acc_mean)
      f1_domain.append(f1_mean)
      print('>>>> thres: {}, dice: {:.4f}, iou: {:.4f}, pixel_acc: {:.4f}'.format(thres, f1_mean, iou_mean, pixel_acc_mean))

    if len(evaluated_thres) >= 2:
      domain_x = np.asarray(evaluated_thres)
      iou_domain_auc = auc(domain_x, np.asarray(iou_domain))
      pixel_acc_domain_auc = auc(domain_x, np.asarray(pixel_acc_domain))
      f1_domain_auc = auc(domain_x, np.asarray(f1_domain))
      print('>>> dice AUC: {:.4f}, iou AUC: {:.4f}, pixel_acc AUC: {:.4f}'.format(f1_domain_auc, iou_domain_auc, pixel_acc_domain_auc))
    else:
      print('>>> fewer than two thresholds evaluated, per-domain AUC skipped')

  # Pool all domains, using only thresholds that actually collected samples.
  summary_thres = [thres for thres in thres_list if f1_res_[thres]]
  if len(summary_thres) < 2:
    raise ValueError('analysis needs results for at least two thresholds to compute an AUC, got {}'.format(len(summary_thres)))
  summary_x = np.asarray(summary_thres)

  iou_res_summary = np.asarray([np.array(iou_res_[thres]).mean() for thres in summary_thres])
  iou_auc = auc(summary_x, iou_res_summary)
  iou_optim = iou_res_summary.max()
  iou_auc_over_optim = iou_auc / iou_optim

  pixel_res_summary = np.asarray([np.array(pixel_acc_res_[thres]).mean() for thres in summary_thres])
  pixel_auc = auc(summary_x, pixel_res_summary)
  pixel_optim = pixel_res_summary.max()
  pixel_auc_over_optim = pixel_auc / pixel_optim

  f1_res_summary = np.asarray([np.array(f1_res_[thres]).mean() for thres in summary_thres])
  f1_auc = auc(summary_x, f1_res_summary)
  f1_optim = f1_res_summary.max()
  f1_auc_over_optim = f1_auc / f1_optim

  print('>>> dice AUC: {:.4f}, dice optimum: {:.4f}, dice AUC/optim: {:.4f}'.format(f1_auc, f1_optim, f1_auc_over_optim))
  print('>>> iou AUC: {:.4f}, iou optimum: {:.4f}, iou AUC/optim: {:.4f}'.format(iou_auc, iou_optim, iou_auc_over_optim))
  print('>>> pixel_acc AUC: {:.4f}, pixel_acc optimum: {:.4f}, pixel_acc AUC/optim: {:.4f}'.format(pixel_auc, pixel_optim, pixel_auc_over_optim))
  print('>>> analysis time: {:.2f}s'.format(time.time() - start))
  return f1_auc, f1_optim, iou_auc, iou_optim, pixel_auc, pixel_optim



In [None]:
from bayes_opt import BayesianOptimization
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# model_key = "stabilityai/stable-diffusion-xl-base-1.0"
# model_key = "stabilityai/sdxl-turbo"

# refiner_key = "stabilityai/stable-diffusion-xl-refiner-1.0"
# refiner_key = None

def black_box_function(base_t=100, refiner_t=0, base_map_weight1=0.8, base_map_weight2=0.2, refiner_map_weight1=0.3, refiner_map_weight2=0.5, refiner_map_weight3=0.2,
                       alpha=0.5, seed=3407, negative_token=True, neg_weight=0.8, a=8, b=0.4):
  """Run one full evaluation for a hyper-parameter setting and return the pooled dice AUC.

  Loads the SDXL pipeline named by the module-level ``model_key`` (plus the refiner named by
  ``refiner_key`` when that is not ``None``) and the BLIP captioning model, runs ``domain_test`` on every
  dataset, scores the results with ``analysis`` and finally releases the models.

  Args:
    base_t, refiner_t: forwarded to ``domain_test`` as ``base_t`` / ``refiner_t``; their integer parts
      are used in the result directory name.
    base_map_weight1, base_map_weight2: normalised to sum to 1 and passed on as ``base_weight``.
    refiner_map_weight1..3: normalised to sum to 1 and passed on as ``refiner_weight``.
    alpha: base/refiner blend factor forwarded to ``domain_test``.
    seed, negative_token, neg_weight, a, b: forwarded to ``domain_test``.

  Returns:
    The first value returned by ``analysis`` (``f1_auc``).

  Side effects:
    Deletes and recreates a ``results_0.9_...`` directory (named after the parameter values) in the
    current working directory.
  """
  VOC_label_map = {
    1:'aeroplane',
    2:'bicycle',
    3:'bird',
    4:'boat',
    5:'bottle',
    6:'bus',
    7:'car',
    8:'cat',
    9:'chair',
    10:'cow',
    11:'diningtable',
    12:'dog',
    13:'horse',
    14:'motorbike',
    15:'person',
    16:'pottedplant',
    17:'sheep',
    18:'sofa',
    19:'train',
    20:'tvmonitor'
  }

  Cityscape_label_map = {
    1: 'road', # flat
    2: 'person', # human
    3: 'building', # construction
    4: 'traffic light', # object
    5: 'vegetation', # nature
    6: 'car', # vehicle
    7: 'bus', # vehicle
    8: 'train', # vehicle
    9: 'motorcycle', # vehicle
    10: 'bicycle', #vehicle
  }

  Vaihingen_label_map = {
    1: 'building'
  }

  Kvasir_label_map = {
    1: 'tumor'
  }

  datasets = ["VOC2012", "Cityscape", "Vaihingen", "Kvasir-SEG"]
  label_maps = [VOC_label_map, Cityscape_label_map, Vaihingen_label_map, Kvasir_label_map]
  # datasets = ["VOC2012"]
  # label_maps = [VOC_label_map]

  # Normalise each weight group so it sums to 1.
  base_map_weight_sum = base_map_weight1 + base_map_weight2
  base_weight = [base_map_weight1 / base_map_weight_sum, base_map_weight2 / base_map_weight_sum]
  refiner_map_weight_sum = refiner_map_weight1 + refiner_map_weight2 + refiner_map_weight3
  refiner_weight = [refiner_map_weight1 / refiner_map_weight_sum, refiner_map_weight2 / refiner_map_weight_sum, refiner_map_weight3 / refiner_map_weight_sum]
  root_dir = "results_0.9_{}_{}_{:.2f}_{:.2f}_{:.2f}_{:.2f}_{:.2f}_{:.2f}_{}_{:.2f}_{:.2f}_{:.2f}".format(
    int(base_t), int(refiner_t), base_weight[0], base_weight[1], refiner_weight[0], refiner_weight[1],
    refiner_weight[2], alpha, negative_token, neg_weight, a, b)

  augmented_label = True
  thres_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
  augmented_label_file = 'aug_label_blip_bert_0.9.json'

  cuda_available = torch.cuda.is_available()
  device = torch.device('cuda:0') if cuda_available else torch.device('cpu')
  # BLIP follows the same CUDA/CPU choice as `device` instead of being pinned to "cuda:0".
  blip_device = "cuda:0" if cuda_available else "cpu"

  # Bind every model name up front so the cleanup below is valid even if loading fails part-way.
  vae = ldm_stable = refiner = processor = model = None
  try:
    vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.bfloat16)
    ldm_stable = StableDiffusionXLPipeline.from_pretrained(model_key, vae=vae, torch_dtype=torch.bfloat16, variant="fp16", use_safetensors=True).to(device)
    ldm_stable.scheduler = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler",
                                                         beta_start=0.00085, beta_end=0.012,
                                                         steps_offset=1)
    if refiner_key is not None:
      # The refiner shares the base pipeline's second text encoder and VAE.
      refiner = DiffusionPipeline.from_pretrained(
        refiner_key,
        text_encoder_2=ldm_stable.text_encoder_2,
        vae=ldm_stable.vae,
        torch_dtype=torch.bfloat16,
        use_safetensors=True,
        variant="fp16",
      ).to(device)
      refiner.scheduler = DDIMScheduler.from_pretrained(refiner_key, subfolder="scheduler",
                                                        beta_start=0.00085, beta_end=0.012,
                                                        steps_offset=1)

    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(blip_device)
    print(model_key)
    if refiner_key is not None:
      print(refiner_key)

    if os.path.isdir(root_dir):
      shutil.rmtree(root_dir)
    os.mkdir(root_dir)
    for ds, label_map in zip(datasets, label_maps):
      images_dir = os.path.join(ds, "images")
      result_dir = os.path.join(root_dir, ds)
      domain_test(processor, model, ldm_stable, refiner, blip_device, device, images_dir, result_dir, label_map, augmented_label_file, base_weight, refiner_weight, base_t,
                  refiner_t, alpha, augmented_label=augmented_label, thres_list=thres_list, seed=seed, negative_token=negative_token,neg_weight=neg_weight, a=a, b=b)

    results_dir_list = [os.path.join(root_dir, ds) for ds in datasets]
    segmentations_dir_list = [os.path.join(ds, "segmentations") for ds in datasets]
    f1_auc, f1_optim, iou_auc, iou_optim, pixel_auc, pixel_optim = analysis(results_dir_list, segmentations_dir_list, augmented_label_file, thres_list=thres_list)
  finally:
    # Drop every local model reference (including `vae`) before emptying the CUDA cache, also when a
    # trial fails, so a failed trial does not keep GPU memory pinned for the next one.
    del vae, ldm_stable, refiner, processor, model
    torch.cuda.empty_cache()

  return f1_auc


def parameter_tuning(turbo=False, n_iter=10, random_state=None):
  """Maximise ``black_box_function`` (pooled dice AUC) with Bayesian optimisation.

  The hand-picked ``initial_points`` are queued with ``probe(..., lazy=True)`` and ``maximize`` is run
  with ``init_points=0``, so no random warm-up points are requested.

  Args:
    turbo: when true, every parameter except ``a`` and ``b`` is pinned by a zero-width bound;
      otherwise the timesteps, map weights and ``alpha`` are searched and ``neg_weight`` / ``a`` / ``b``
      keep ``black_box_function``'s defaults.
    n_iter: number of optimisation steps passed to ``maximize``.
    random_state: forwarded to ``BayesianOptimization`` so a search can be repeated; the default
      ``None`` keeps the previous unseeded behaviour.

  Returns:
    The ``BayesianOptimization`` instance, so callers can read ``optimizer.max`` / ``optimizer.res``
    instead of recovering the best parameters from the printed log.
  """
  if turbo:
    pbounds = {'base_t': (100, 100),
               'refiner_t': (100, 100),
               'base_map_weight1': (0.99, 0.99),
               'base_map_weight2': (0.01, 0.01),
               'refiner_map_weight1': (1., 1.),
               'refiner_map_weight2': (1., 1.),
               'refiner_map_weight3': (1., 1.),
               'alpha': (1., 1.),
               'neg_weight': (1.,1.),
               'a': (8, 24),
               'b': (0.1, 1.0)
              }
    # NOTE(review): base_map_weight1/2 of this seed point (0.8 / 0.2) lie outside the pinned bounds
    # above; kept as-is to preserve the existing search behaviour.
    initial_points = [
      {'base_t': 100, 'refiner_t': 100, 'base_map_weight1': 0.8, 'base_map_weight2': 0.2, 'refiner_map_weight1': 1.0, 'refiner_map_weight2': 1.0, 'refiner_map_weight3': 1.0, 'alpha': 1.0, 'neg_weight': 1.0, 'a': 16, 'b':0.57},
    ]
  else:
    pbounds = {'base_t': (90, 110),
               'refiner_t': (90, 110),
               'base_map_weight1': (0.01, 0.99),
               'base_map_weight2': (0.01, 0.99),
               'refiner_map_weight1': (0.01, 0.99),
               'refiner_map_weight2': (0.01, 0.99),
               'refiner_map_weight3': (0.01, 0.99),
               'alpha': (0.01, 0.99)
              }
    # NOTE(review): the refiner weights of the first seed point (1.0) exceed the 0.99 upper bound;
    # kept as-is to preserve the existing search behaviour.
    initial_points = [
      {'base_t': 100, 'refiner_t': 100, 'base_map_weight1': 0.8, 'base_map_weight2': 0.2, 'refiner_map_weight1': 1.0, 'refiner_map_weight2': 1.0, 'refiner_map_weight3': 1.0, 'alpha': 0.5},
      {'base_t': 100, 'refiner_t': 93, 'base_map_weight1': 0.5, 'base_map_weight2': 0.5, 'refiner_map_weight1': 0.01, 'refiner_map_weight2': 0.98, 'refiner_map_weight3': 0.01, 'alpha': 0.01},
    ]

  optimizer = BayesianOptimization(
    f=black_box_function,
    pbounds=pbounds,
    random_state=random_state,
    verbose=2, # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
  )
  for p in initial_points:
    optimizer.probe(p, lazy=True)

  optimizer.maximize(
    init_points=0,
    n_iter=n_iter,
  )
  return optimizer

# seed = 3280
# seed = 369
# seed = 200
# NOTE(review): `seed` is only consumed by the commented-out black_box_function(...) calls below.
# parameter_tuning() does not forward it, so the optimisation runs use black_box_function's default
# seed=3407 -- the ">>> seed: 3407" lines in the logged output agree.
seed = 3871
# black_box_function reads model_key / refiner_key as module-level globals, so both must be assigned
# before it (or parameter_tuning) is called. refiner_key = None disables the refiner pass.
model_key = "stabilityai/sdxl-turbo"
refiner_key = None
# Earlier manual sweeps over neg_weight / b, kept for reference:
# black_box_function(base_t=108, refiner_t=93, base_map_weight1=0.99, base_map_weight2=0.01, refiner_map_weight1=0.01, refiner_map_weight2=0.98, refiner_map_weight3=0.01, alpha=0.01, seed=seed, negative_token=True, neg_weight=0.8, a=8, b=0.4)
# black_box_function(base_t=108, refiner_t=93, base_map_weight1=0.99, base_map_weight2=0.01, refiner_map_weight1=0.01, refiner_map_weight2=0.98, refiner_map_weight3=0.01, alpha=0.01, seed=seed, negative_token=True, neg_weight=0.6, a=8, b=0.4)
# black_box_function(base_t=108, refiner_t=93, base_map_weight1=0.99, base_map_weight2=0.01, refiner_map_weight1=0.01, refiner_map_weight2=0.98, refiner_map_weight3=0.01, alpha=0.01, seed=seed, negative_token=True, neg_weight=0.4, a=8, b=0.4)
# black_box_function(base_t=108, refiner_t=93, base_map_weight1=0.99, base_map_weight2=0.01, refiner_map_weight1=0.01, refiner_map_weight2=0.98, refiner_map_weight3=0.01, alpha=0.01, seed=seed, negative_token=True, neg_weight=0.2, a=8, b=0.4)
# black_box_function(base_t=108, refiner_t=93, base_map_weight1=0.99, base_map_weight2=0.01, refiner_map_weight1=0.01, refiner_map_weight2=0.98, refiner_map_weight3=0.01, alpha=0.01, seed=seed, negative_token=True, neg_weight=1.0, a=8, b=0.2)


# Turbo search: only `a` and `b` have non-degenerate bounds (see parameter_tuning).
parameter_tuning(turbo=True, n_iter=50)
# Alternative configuration (SDXL base + refiner), currently disabled:
# model_key = "stabilityai/stable-diffusion-xl-base-1.0"
# refiner_key = "stabilityai/stable-diffusion-xl-refiner-1.0"
# black_box_function(base_t=108, refiner_t=93, base_map_weight1=0.5, base_map_weight2=0.5, refiner_map_weight1=0.01, refiner_map_weight2=0.98, refiner_map_weight3=0.01, alpha=0.01, seed=seed)


|   iter    |  target   |     a     |   alpha   |     b     | base_m... | base_m... |  base_t   | neg_we... | refine... | refine... | refine... | refiner_t |
-------------------------------------------------------------------------------------------------------------------------------------------------------------


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.50s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 75.22s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.75s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.54s
>>>>  results_0.9_100_100_0.80_0.20_0.33_0.33_0.33_1.00_True_1.00_16.00_0.57/VOC2012
>>>> thres: 0.1, dice: 0.5637, iou: 0.4493, pixel_acc: 0.7484
>>>> thres: 0.2, dice: 0.5803, iou: 0.4664, pixel_acc: 0.7786
>>>> thres: 0.3, dice: 0.5862, iou: 0.4728, pixel_acc: 0.7944
>>>> thres: 0.4, dice: 0.5849, iou: 0.4712, pixel_acc: 0.8029
>>>> thres: 0.5, dice: 0.5810, iou: 0.4669, pixel_acc: 0.8096
>>>> thres: 0.6, dice: 0.5741, iou: 0.4591, pixel_acc: 0.8150
>>>> thres: 0.7, dice: 0.5629, iou: 0.4465, pixel_acc: 0.8191
>>>> thres: 0.8, dice: 0.5438, iou: 0.4257, pixel_acc: 0.8216
>>>> thres: 0.9, dice: 0.5005, iou: 0.3810, pixel_acc: 0.8215
>>> dice AUC: 0.4545, iou AUC: 0.3624, pixel_acc AUC: 0.6426
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.29s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 74.88s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.82s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.53s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_24.00_0.14/VOC2012
>>>> thres: 0.1, dice: 0.3365, iou: 0.2317, pixel_acc: 0.3071
>>>> thres: 0.2, dice: 0.3504, iou: 0.2431, pixel_acc: 0.3541
>>>> thres: 0.3, dice: 0.3609, iou: 0.2517, pixel_acc: 0.3833
>>>> thres: 0.4, dice: 0.3707, iou: 0.2600, pixel_acc: 0.4084
>>>> thres: 0.5, dice: 0.3803, iou: 0.2686, pixel_acc: 0.4317
>>>> thres: 0.6, dice: 0.3908, iou: 0.2780, pixel_acc: 0.4559
>>>> thres: 0.7, dice: 0.4031, iou: 0.2895, pixel_acc: 0.4825
>>>> thres: 0.8, dice: 0.4177, iou: 0.3035, pixel_acc: 0.5123
>>>> thres: 0.9, dice: 0.4410, iou: 0.3267, pixel_acc: 0.5566
>>> dice AUC: 0.3062, iou AUC: 0.2174, pixel_acc AUC: 0.3460
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.23s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 74.37s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.66s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.82s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_12.00_0.70/VOC2012
>>>> thres: 0.1, dice: 0.5867, iou: 0.4744, pixel_acc: 0.7955
>>>> thres: 0.2, dice: 0.5836, iou: 0.4708, pixel_acc: 0.8168
>>>> thres: 0.3, dice: 0.5685, iou: 0.4535, pixel_acc: 0.8236
>>>> thres: 0.4, dice: 0.5460, iou: 0.4289, pixel_acc: 0.8255
>>>> thres: 0.5, dice: 0.5180, iou: 0.4000, pixel_acc: 0.8254
>>>> thres: 0.6, dice: 0.4826, iou: 0.3649, pixel_acc: 0.8238
>>>> thres: 0.7, dice: 0.4374, iou: 0.3218, pixel_acc: 0.8211
>>>> thres: 0.8, dice: 0.3766, iou: 0.2661, pixel_acc: 0.8162
>>>> thres: 0.9, dice: 0.2724, iou: 0.1782, pixel_acc: 0.8061
>>> dice AUC: 0.3942, iou AUC: 0.3032, pixel_acc AUC: 0.6553
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 181.25s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 70.90s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.50s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.28s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_18.31_1.00/VOC2012
>>>> thres: 0.1, dice: 0.3152, iou: 0.2128, pixel_acc: 0.8106
>>>> thres: 0.2, dice: 0.2339, iou: 0.1490, pixel_acc: 0.8023
>>>> thres: 0.3, dice: 0.1811, iou: 0.1112, pixel_acc: 0.7978
>>>> thres: 0.4, dice: 0.1422, iou: 0.0849, pixel_acc: 0.7949
>>>> thres: 0.5, dice: 0.1082, iou: 0.0627, pixel_acc: 0.7925
>>>> thres: 0.6, dice: 0.0772, iou: 0.0433, pixel_acc: 0.7905
>>>> thres: 0.7, dice: 0.0502, iou: 0.0273, pixel_acc: 0.7890
>>>> thres: 0.8, dice: 0.0274, iou: 0.0145, pixel_acc: 0.7878
>>>> thres: 0.9, dice: 0.0091, iou: 0.0046, pixel_acc: 0.7869
>>> dice AUC: 0.0982, iou AUC: 0.0602, pixel_acc AUC: 0.6353
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.89s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 74.67s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.85s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.61s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_11.96_0.67/VOC2012
>>>> thres: 0.1, dice: 0.5806, iou: 0.4680, pixel_acc: 0.7801
>>>> thres: 0.2, dice: 0.5878, iou: 0.4754, pixel_acc: 0.8099
>>>> thres: 0.3, dice: 0.5794, iou: 0.4659, pixel_acc: 0.8197
>>>> thres: 0.4, dice: 0.5644, iou: 0.4490, pixel_acc: 0.8243
>>>> thres: 0.5, dice: 0.5419, iou: 0.4245, pixel_acc: 0.8256
>>>> thres: 0.6, dice: 0.5128, iou: 0.3949, pixel_acc: 0.8253
>>>> thres: 0.7, dice: 0.4726, iou: 0.3552, pixel_acc: 0.8233
>>>> thres: 0.8, dice: 0.4151, iou: 0.3011, pixel_acc: 0.8195
>>>> thres: 0.9, dice: 0.3176, iou: 0.2148, pixel_acc: 0.8108
>>> dice AUC: 0.4123, iou AUC: 0.3207, pixel_acc AUC: 0.6543
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.40s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.32s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.83s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.59s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_11.39_0.27/VOC2012
>>>> thres: 0.1, dice: 0.3611, iou: 0.2519, pixel_acc: 0.3841
>>>> thres: 0.2, dice: 0.3951, iou: 0.2820, pixel_acc: 0.4654
>>>> thres: 0.3, dice: 0.4229, iou: 0.3087, pixel_acc: 0.5222
>>>> thres: 0.4, dice: 0.4473, iou: 0.3330, pixel_acc: 0.5679
>>>> thres: 0.5, dice: 0.4683, iou: 0.3539, pixel_acc: 0.6039
>>>> thres: 0.6, dice: 0.4901, iou: 0.3758, pixel_acc: 0.6395
>>>> thres: 0.7, dice: 0.5148, iou: 0.4007, pixel_acc: 0.6779
>>>> thres: 0.8, dice: 0.5428, iou: 0.4298, pixel_acc: 0.7174
>>>> thres: 0.9, dice: 0.5754, iou: 0.4627, pixel_acc: 0.7697
>>> dice AUC: 0.3750, iou AUC: 0.2841, pixel_acc AUC: 0.4771
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.70s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 75.48s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.80s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.48s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_13.98_0.56/VOC2012
>>>> thres: 0.1, dice: 0.5488, iou: 0.4361, pixel_acc: 0.7271
>>>> thres: 0.2, dice: 0.5754, iou: 0.4627, pixel_acc: 0.7698
>>>> thres: 0.3, dice: 0.5854, iou: 0.4729, pixel_acc: 0.7912
>>>> thres: 0.4, dice: 0.5883, iou: 0.4759, pixel_acc: 0.8052
>>>> thres: 0.5, dice: 0.5866, iou: 0.4742, pixel_acc: 0.8135
>>>> thres: 0.6, dice: 0.5802, iou: 0.4669, pixel_acc: 0.8192
>>>> thres: 0.7, dice: 0.5684, iou: 0.4534, pixel_acc: 0.8236
>>>> thres: 0.8, dice: 0.5442, iou: 0.4270, pixel_acc: 0.8255
>>>> thres: 0.9, dice: 0.4898, iou: 0.3721, pixel_acc: 0.8241
>>> dice AUC: 0.4548, iou AUC: 0.3637, pixel_acc AUC: 0.6424
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.45s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 75.68s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.81s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.53s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_17.08_0.58/VOC2012
>>>> thres: 0.1, dice: 0.5709, iou: 0.4581, pixel_acc: 0.7624
>>>> thres: 0.2, dice: 0.5848, iou: 0.4723, pixel_acc: 0.7897
>>>> thres: 0.3, dice: 0.5882, iou: 0.4759, pixel_acc: 0.8041
>>>> thres: 0.4, dice: 0.5872, iou: 0.4748, pixel_acc: 0.8119
>>>> thres: 0.5, dice: 0.5831, iou: 0.4702, pixel_acc: 0.8172
>>>> thres: 0.6, dice: 0.5761, iou: 0.4622, pixel_acc: 0.8212
>>>> thres: 0.7, dice: 0.5649, iou: 0.4495, pixel_acc: 0.8243
>>>> thres: 0.8, dice: 0.5440, iou: 0.4267, pixel_acc: 0.8255
>>>> thres: 0.9, dice: 0.5008, iou: 0.3829, pixel_acc: 0.8248
>>> dice AUC: 0.4564, iou AUC: 0.3652, pixel_acc AUC: 0.6488
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.95s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.89s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.85s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.52s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_16.52_0.37/VOC2012
>>>> thres: 0.1, dice: 0.4468, iou: 0.3325, pixel_acc: 0.5671
>>>> thres: 0.2, dice: 0.4771, iou: 0.3626, pixel_acc: 0.6183
>>>> thres: 0.3, dice: 0.4978, iou: 0.3834, pixel_acc: 0.6521
>>>> thres: 0.4, dice: 0.5154, iou: 0.4013, pixel_acc: 0.6788
>>>> thres: 0.5, dice: 0.5312, iou: 0.4177, pixel_acc: 0.7008
>>>> thres: 0.6, dice: 0.5443, iou: 0.4315, pixel_acc: 0.7198
>>>> thres: 0.7, dice: 0.5573, iou: 0.4446, pixel_acc: 0.7412
>>>> thres: 0.8, dice: 0.5724, iou: 0.4596, pixel_acc: 0.7647
>>>> thres: 0.9, dice: 0.5859, iou: 0.4735, pixel_acc: 0.7925
>>> dice AUC: 0.4212, iou AUC: 0.3304, pixel_acc AUC: 0.5556
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 184.60s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 75.68s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.91s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.79s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_17.09_0.58/VOC2012
>>>> thres: 0.1, dice: 0.5713, iou: 0.4584, pixel_acc: 0.7630
>>>> thres: 0.2, dice: 0.5850, iou: 0.4725, pixel_acc: 0.7902
>>>> thres: 0.3, dice: 0.5883, iou: 0.4759, pixel_acc: 0.8045
>>>> thres: 0.4, dice: 0.5871, iou: 0.4747, pixel_acc: 0.8122
>>>> thres: 0.5, dice: 0.5829, iou: 0.4700, pixel_acc: 0.8174
>>>> thres: 0.6, dice: 0.5758, iou: 0.4618, pixel_acc: 0.8213
>>>> thres: 0.7, dice: 0.5644, iou: 0.4490, pixel_acc: 0.8243
>>>> thres: 0.8, dice: 0.5433, iou: 0.4260, pixel_acc: 0.8255
>>>> thres: 0.9, dice: 0.4999, iou: 0.3821, pixel_acc: 0.8247
>>> dice AUC: 0.4562, iou AUC: 0.3650, pixel_acc AUC: 0.6489
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.57s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 75.43s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.76s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.64s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_14.91_0.59/VOC2012
>>>> thres: 0.1, dice: 0.5671, iou: 0.4542, pixel_acc: 0.7564
>>>> thres: 0.2, dice: 0.5844, iou: 0.4719, pixel_acc: 0.7888
>>>> thres: 0.3, dice: 0.5883, iou: 0.4759, pixel_acc: 0.8052
>>>> thres: 0.4, dice: 0.5865, iou: 0.4740, pixel_acc: 0.8136
>>>> thres: 0.5, dice: 0.5805, iou: 0.4672, pixel_acc: 0.8190
>>>> thres: 0.6, dice: 0.5707, iou: 0.4560, pixel_acc: 0.8230
>>>> thres: 0.7, dice: 0.5542, iou: 0.4378, pixel_acc: 0.8250
>>>> thres: 0.8, dice: 0.5262, iou: 0.4083, pixel_acc: 0.8256
>>>> thres: 0.9, dice: 0.4681, iou: 0.3509, pixel_acc: 0.8231
>>> dice AUC: 0.4508, iou AUC: 0.3594, pixel_acc AUC: 0.6490
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.85s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 74.51s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.78s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.32s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_15.48_0.10/VOC2012
>>>> thres: 0.1, dice: 0.3252, iou: 0.2222, pixel_acc: 0.2601
>>>> thres: 0.2, dice: 0.3361, iou: 0.2314, pixel_acc: 0.3055
>>>> thres: 0.3, dice: 0.3475, iou: 0.2408, pixel_acc: 0.3454
>>>> thres: 0.4, dice: 0.3585, iou: 0.2497, pixel_acc: 0.3771
>>>> thres: 0.5, dice: 0.3707, iou: 0.2600, pixel_acc: 0.4085
>>>> thres: 0.6, dice: 0.3847, iou: 0.2725, pixel_acc: 0.4421
>>>> thres: 0.7, dice: 0.4022, iou: 0.2886, pixel_acc: 0.4805
>>>> thres: 0.8, dice: 0.4241, iou: 0.3099, pixel_acc: 0.5245
>>>> thres: 0.9, dice: 0.4578, iou: 0.3435, pixel_acc: 0.5863
>>> dice AUC: 0.3015, iou AUC: 0.2136, pixel_acc AUC: 0.3307
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.31s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 71.42s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.52s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.39s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_14.47_1.00/VOC2012
>>>> thres: 0.1, dice: 0.3891, iou: 0.2772, pixel_acc: 0.8172
>>>> thres: 0.2, dice: 0.3015, iou: 0.2015, pixel_acc: 0.8092
>>>> thres: 0.3, dice: 0.2367, iou: 0.1510, pixel_acc: 0.8026
>>>> thres: 0.4, dice: 0.1862, iou: 0.1147, pixel_acc: 0.7982
>>>> thres: 0.5, dice: 0.1453, iou: 0.0870, pixel_acc: 0.7951
>>>> thres: 0.6, dice: 0.1073, iou: 0.0621, pixel_acc: 0.7924
>>>> thres: 0.7, dice: 0.0713, iou: 0.0398, pixel_acc: 0.7901
>>>> thres: 0.8, dice: 0.0398, iou: 0.0214, pixel_acc: 0.7884
>>>> thres: 0.9, dice: 0.0138, iou: 0.0071, pixel_acc: 0.7871
>>> dice AUC: 0.1289, iou AUC: 0.0820, pixel_acc AUC: 0.6378
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.97s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 75.84s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.69s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.27s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_13.72_0.23/VOC2012
>>>> thres: 0.1, dice: 0.3556, iou: 0.2473, pixel_acc: 0.3693
>>>> thres: 0.2, dice: 0.3831, iou: 0.2710, pixel_acc: 0.4383
>>>> thres: 0.3, dice: 0.4062, iou: 0.2924, pixel_acc: 0.4891
>>>> thres: 0.4, dice: 0.4264, iou: 0.3121, pixel_acc: 0.5289
>>>> thres: 0.5, dice: 0.4458, iou: 0.3315, pixel_acc: 0.5653
>>>> thres: 0.6, dice: 0.4636, iou: 0.3491, pixel_acc: 0.5960
>>>> thres: 0.7, dice: 0.4837, iou: 0.3693, pixel_acc: 0.6290
>>>> thres: 0.8, dice: 0.5084, iou: 0.3942, pixel_acc: 0.6686
>>>> thres: 0.9, dice: 0.5441, iou: 0.4312, pixel_acc: 0.7195
>>> dice AUC: 0.3567, iou AUC: 0.2659, pixel_acc AUC: 0.4459
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.50s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 72.81s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.73s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.57s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_16.26_0.82/VOC2012
>>>> thres: 0.1, dice: 0.5300, iou: 0.4122, pixel_acc: 0.8256
>>>> thres: 0.2, dice: 0.4787, iou: 0.3612, pixel_acc: 0.8236
>>>> thres: 0.3, dice: 0.4363, iou: 0.3207, pixel_acc: 0.8211
>>>> thres: 0.4, dice: 0.3984, iou: 0.2857, pixel_acc: 0.8181
>>>> thres: 0.5, dice: 0.3612, iou: 0.2524, pixel_acc: 0.8148
>>>> thres: 0.6, dice: 0.3212, iou: 0.2178, pixel_acc: 0.8112
>>>> thres: 0.7, dice: 0.2731, iou: 0.1787, pixel_acc: 0.8061
>>>> thres: 0.8, dice: 0.2142, iou: 0.1346, pixel_acc: 0.8006
>>>> thres: 0.9, dice: 0.1348, iou: 0.0800, pixel_acc: 0.7943
>>> dice AUC: 0.2816, iou AUC: 0.1997, pixel_acc AUC: 0.6505
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.60s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.24s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.87s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.59s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_12.21_0.52/VOC2012
>>>> thres: 0.1, dice: 0.5112, iou: 0.3970, pixel_acc: 0.6728
>>>> thres: 0.2, dice: 0.5493, iou: 0.4366, pixel_acc: 0.7280
>>>> thres: 0.3, dice: 0.5705, iou: 0.4576, pixel_acc: 0.7616
>>>> thres: 0.4, dice: 0.5818, iou: 0.4692, pixel_acc: 0.7826
>>>> thres: 0.5, dice: 0.5878, iou: 0.4754, pixel_acc: 0.7996
>>>> thres: 0.6, dice: 0.5876, iou: 0.4752, pixel_acc: 0.8108
>>>> thres: 0.7, dice: 0.5810, iou: 0.4678, pixel_acc: 0.8188
>>>> thres: 0.8, dice: 0.5635, iou: 0.4479, pixel_acc: 0.8244
>>>> thres: 0.9, dice: 0.5124, iou: 0.3945, pixel_acc: 0.8253
>>> dice AUC: 0.4533, iou AUC: 0.3626, pixel_acc AUC: 0.6275
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.87s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.07s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.84s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.63s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_11.99_0.22/VOC2012
>>>> thres: 0.1, dice: 0.3486, iou: 0.2417, pixel_acc: 0.3488
>>>> thres: 0.2, dice: 0.3752, iou: 0.2640, pixel_acc: 0.4194
>>>> thres: 0.3, dice: 0.3994, iou: 0.2860, pixel_acc: 0.4746
>>>> thres: 0.4, dice: 0.4213, iou: 0.3071, pixel_acc: 0.5192
>>>> thres: 0.5, dice: 0.4429, iou: 0.3287, pixel_acc: 0.5602
>>>> thres: 0.6, dice: 0.4630, iou: 0.3486, pixel_acc: 0.5951
>>>> thres: 0.7, dice: 0.4858, iou: 0.3714, pixel_acc: 0.6323
>>>> thres: 0.8, dice: 0.5142, iou: 0.4001, pixel_acc: 0.6771
>>>> thres: 0.9, dice: 0.5521, iou: 0.4394, pixel_acc: 0.7325
>>> dice AUC: 0.3552, iou AUC: 0.2646, pixel_acc AUC: 0.4418
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.88s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 77.17s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.88s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.49s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_18.81_0.37/VOC2012
>>>> thres: 0.1, dice: 0.4571, iou: 0.3428, pixel_acc: 0.5851
>>>> thres: 0.2, dice: 0.4844, iou: 0.3699, pixel_acc: 0.6300
>>>> thres: 0.3, dice: 0.5023, iou: 0.3880, pixel_acc: 0.6593
>>>> thres: 0.4, dice: 0.5182, iou: 0.4041, pixel_acc: 0.6827
>>>> thres: 0.5, dice: 0.5320, iou: 0.4184, pixel_acc: 0.7017
>>>> thres: 0.6, dice: 0.5435, iou: 0.4306, pixel_acc: 0.7185
>>>> thres: 0.7, dice: 0.5549, iou: 0.4422, pixel_acc: 0.7373
>>>> thres: 0.8, dice: 0.5687, iou: 0.4558, pixel_acc: 0.7588
>>>> thres: 0.9, dice: 0.5825, iou: 0.4700, pixel_acc: 0.7842
>>> dice AUC: 0.4224, iou AUC: 0.3315, pixel_acc AUC: 0.5573
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.42s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.28s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.78s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.32s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_17.01_0.24/VOC2012
>>>> thres: 0.1, dice: 0.3667, iou: 0.2566, pixel_acc: 0.3985
>>>> thres: 0.2, dice: 0.3932, iou: 0.2803, pixel_acc: 0.4614
>>>> thres: 0.3, dice: 0.4131, iou: 0.2991, pixel_acc: 0.5035
>>>> thres: 0.4, dice: 0.4304, iou: 0.3161, pixel_acc: 0.5368
>>>> thres: 0.5, dice: 0.4464, iou: 0.3321, pixel_acc: 0.5664
>>>> thres: 0.6, dice: 0.4609, iou: 0.3465, pixel_acc: 0.5916
>>>> thres: 0.7, dice: 0.4772, iou: 0.3627, pixel_acc: 0.6185
>>>> thres: 0.8, dice: 0.4974, iou: 0.3830, pixel_acc: 0.6514
>>>> thres: 0.9, dice: 0.5285, iou: 0.4148, pixel_acc: 0.6971
>>> dice AUC: 0.3566, iou AUC: 0.2656, pixel_acc AUC: 0.4477
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.86s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 75.79s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.75s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.40s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_8.25_0.15/VOC2012
>>>> thres: 0.1, dice: 0.3318, iou: 0.2277, pixel_acc: 0.2882
>>>> thres: 0.2, dice: 0.3522, iou: 0.2446, pixel_acc: 0.3596
>>>> thres: 0.3, dice: 0.3736, iou: 0.2626, pixel_acc: 0.4156
>>>> thres: 0.4, dice: 0.3977, iou: 0.2844, pixel_acc: 0.4710
>>>> thres: 0.5, dice: 0.4235, iou: 0.3092, pixel_acc: 0.5232
>>>> thres: 0.6, dice: 0.4514, iou: 0.3371, pixel_acc: 0.5752
>>>> thres: 0.7, dice: 0.4814, iou: 0.3669, pixel_acc: 0.6252
>>>> thres: 0.8, dice: 0.5208, iou: 0.4069, pixel_acc: 0.6864
>>>> thres: 0.9, dice: 0.5698, iou: 0.4570, pixel_acc: 0.7606
>>> dice AUC: 0.3451, iou AUC: 0.2554, pixel_acc AUC: 0.4181
>>>>  results_0

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 181.87s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 71.91s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.65s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.68s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_17.06_0.88/VOC2012
>>>> thres: 0.1, dice: 0.4627, iou: 0.3457, pixel_acc: 0.8227
>>>> thres: 0.2, dice: 0.3977, iou: 0.2851, pixel_acc: 0.8180
>>>> thres: 0.3, dice: 0.3499, iou: 0.2425, pixel_acc: 0.8138
>>>> thres: 0.4, dice: 0.3067, iou: 0.2058, pixel_acc: 0.8097
>>>> thres: 0.5, dice: 0.2631, iou: 0.1711, pixel_acc: 0.8051
>>>> thres: 0.6, dice: 0.2205, iou: 0.1392, pixel_acc: 0.8012
>>>> thres: 0.7, dice: 0.1758, iou: 0.1076, pixel_acc: 0.7974
>>>> thres: 0.8, dice: 0.1272, iou: 0.0750, pixel_acc: 0.7938
>>>> thres: 0.9, dice: 0.0639, iou: 0.0353, pixel_acc: 0.7897
>>> dice AUC: 0.2104, iou AUC: 0.1417, pixel_acc AUC: 0.6445
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.43s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 72.89s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.62s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.68s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_9.14_0.86/VOC2012
>>>> thres: 0.1, dice: 0.5811, iou: 0.4680, pixel_acc: 0.8186
>>>> thres: 0.2, dice: 0.5345, iou: 0.4167, pixel_acc: 0.8256
>>>> thres: 0.3, dice: 0.4783, iou: 0.3607, pixel_acc: 0.8236
>>>> thres: 0.4, dice: 0.4208, iou: 0.3065, pixel_acc: 0.8199
>>>> thres: 0.5, dice: 0.3639, iou: 0.2548, pixel_acc: 0.8150
>>>> thres: 0.6, dice: 0.3029, iou: 0.2027, pixel_acc: 0.8093
>>>> thres: 0.7, dice: 0.2334, iou: 0.1486, pixel_acc: 0.8023
>>>> thres: 0.8, dice: 0.1600, iou: 0.0969, pixel_acc: 0.7963
>>>> thres: 0.9, dice: 0.0755, iou: 0.0423, pixel_acc: 0.7904
>>> dice AUC: 0.2822, iou AUC: 0.2042, pixel_acc AUC: 0.6497
>>>>  results_0

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.89s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 74.37s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.74s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.69s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_13.78_0.69/VOC2012
>>>> thres: 0.1, dice: 0.5883, iou: 0.4759, pixel_acc: 0.8048
>>>> thres: 0.2, dice: 0.5802, iou: 0.4669, pixel_acc: 0.8192
>>>> thres: 0.3, dice: 0.5647, iou: 0.4493, pixel_acc: 0.8243
>>>> thres: 0.4, dice: 0.5435, iou: 0.4263, pixel_acc: 0.8255
>>>> thres: 0.5, dice: 0.5188, iou: 0.4008, pixel_acc: 0.8254
>>>> thres: 0.6, dice: 0.4880, iou: 0.3703, pixel_acc: 0.8240
>>>> thres: 0.7, dice: 0.4493, iou: 0.3331, pixel_acc: 0.8220
>>>> thres: 0.8, dice: 0.3961, iou: 0.2836, pixel_acc: 0.8179
>>>> thres: 0.9, dice: 0.3063, iou: 0.2054, pixel_acc: 0.8097
>>> dice AUC: 0.3988, iou AUC: 0.3071, pixel_acc AUC: 0.6566
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.23s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.74s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.94s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.67s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_14.87_0.34/VOC2012
>>>> thres: 0.1, dice: 0.4173, iou: 0.3031, pixel_acc: 0.5116
>>>> thres: 0.2, dice: 0.4527, iou: 0.3384, pixel_acc: 0.5775
>>>> thres: 0.3, dice: 0.4749, iou: 0.3604, pixel_acc: 0.6148
>>>> thres: 0.4, dice: 0.4936, iou: 0.3793, pixel_acc: 0.6453
>>>> thres: 0.5, dice: 0.5112, iou: 0.3970, pixel_acc: 0.6728
>>>> thres: 0.6, dice: 0.5291, iou: 0.4154, pixel_acc: 0.6978
>>>> thres: 0.7, dice: 0.5451, iou: 0.4323, pixel_acc: 0.7211
>>>> thres: 0.8, dice: 0.5627, iou: 0.4499, pixel_acc: 0.7497
>>>> thres: 0.9, dice: 0.5822, iou: 0.4697, pixel_acc: 0.7835
>>> dice AUC: 0.4069, iou AUC: 0.3159, pixel_acc AUC: 0.5327
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 184.15s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.42s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.85s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.38s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_12.47_0.48/VOC2012
>>>> thres: 0.1, dice: 0.4875, iou: 0.3731, pixel_acc: 0.6351
>>>> thres: 0.2, dice: 0.5289, iou: 0.4153, pixel_acc: 0.6977
>>>> thres: 0.3, dice: 0.5514, iou: 0.4387, pixel_acc: 0.7314
>>>> thres: 0.4, dice: 0.5685, iou: 0.4556, pixel_acc: 0.7585
>>>> thres: 0.5, dice: 0.5797, iou: 0.4671, pixel_acc: 0.7782
>>>> thres: 0.6, dice: 0.5868, iou: 0.4744, pixel_acc: 0.7955
>>>> thres: 0.7, dice: 0.5880, iou: 0.4757, pixel_acc: 0.8088
>>>> thres: 0.8, dice: 0.5809, iou: 0.4677, pixel_acc: 0.8188
>>>> thres: 0.9, dice: 0.5498, iou: 0.4330, pixel_acc: 0.8253
>>> dice AUC: 0.4503, iou AUC: 0.3597, pixel_acc AUC: 0.6119
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.30s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 73.75s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.79s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.66s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_12.47_0.75/VOC2012
>>>> thres: 0.1, dice: 0.5855, iou: 0.4729, pixel_acc: 0.8150
>>>> thres: 0.2, dice: 0.5628, iou: 0.4471, pixel_acc: 0.8245
>>>> thres: 0.3, dice: 0.5320, iou: 0.4141, pixel_acc: 0.8256
>>>> thres: 0.4, dice: 0.4983, iou: 0.3805, pixel_acc: 0.8246
>>>> thres: 0.5, dice: 0.4608, iou: 0.3439, pixel_acc: 0.8226
>>>> thres: 0.6, dice: 0.4185, iou: 0.3043, pixel_acc: 0.8197
>>>> thres: 0.7, dice: 0.3691, iou: 0.2594, pixel_acc: 0.8155
>>>> thres: 0.8, dice: 0.3031, iou: 0.2028, pixel_acc: 0.8093
>>>> thres: 0.9, dice: 0.1979, iou: 0.1229, pixel_acc: 0.7992
>>> dice AUC: 0.3536, iou AUC: 0.2650, pixel_acc AUC: 0.6549
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.80s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.57s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.82s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.74s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_12.38_0.25/VOC2012
>>>> thres: 0.1, dice: 0.3584, iou: 0.2497, pixel_acc: 0.3769
>>>> thres: 0.2, dice: 0.3892, iou: 0.2766, pixel_acc: 0.4523
>>>> thres: 0.3, dice: 0.4147, iou: 0.3006, pixel_acc: 0.5066
>>>> thres: 0.4, dice: 0.4376, iou: 0.3233, pixel_acc: 0.5505
>>>> thres: 0.5, dice: 0.4576, iou: 0.3432, pixel_acc: 0.5858
>>>> thres: 0.6, dice: 0.4774, iou: 0.3629, pixel_acc: 0.6189
>>>> thres: 0.7, dice: 0.4997, iou: 0.3854, pixel_acc: 0.6552
>>>> thres: 0.8, dice: 0.5280, iou: 0.4142, pixel_acc: 0.6963
>>>> thres: 0.9, dice: 0.5614, iou: 0.4487, pixel_acc: 0.7478
>>> dice AUC: 0.3664, iou AUC: 0.2755, pixel_acc AUC: 0.4628
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.68s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 75.45s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.96s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.73s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_15.18_0.62/VOC2012
>>>> thres: 0.1, dice: 0.5776, iou: 0.4648, pixel_acc: 0.7739
>>>> thres: 0.2, dice: 0.5879, iou: 0.4756, pixel_acc: 0.8012
>>>> thres: 0.3, dice: 0.5869, iou: 0.4745, pixel_acc: 0.8126
>>>> thres: 0.4, dice: 0.5811, iou: 0.4679, pixel_acc: 0.8187
>>>> thres: 0.5, dice: 0.5717, iou: 0.4572, pixel_acc: 0.8227
>>>> thres: 0.6, dice: 0.5578, iou: 0.4416, pixel_acc: 0.8248
>>>> thres: 0.7, dice: 0.5369, iou: 0.4192, pixel_acc: 0.8256
>>>> thres: 0.8, dice: 0.5042, iou: 0.3863, pixel_acc: 0.8250
>>>> thres: 0.9, dice: 0.4397, iou: 0.3239, pixel_acc: 0.8213
>>> dice AUC: 0.4435, iou AUC: 0.3517, pixel_acc AUC: 0.6528
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.49s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.49s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.96s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.57s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_23.55_0.50/VOC2012
>>>> thres: 0.1, dice: 0.5474, iou: 0.4346, pixel_acc: 0.7248
>>>> thres: 0.2, dice: 0.5642, iou: 0.4513, pixel_acc: 0.7521
>>>> thres: 0.3, dice: 0.5742, iou: 0.4614, pixel_acc: 0.7676
>>>> thres: 0.4, dice: 0.5798, iou: 0.4671, pixel_acc: 0.7784
>>>> thres: 0.5, dice: 0.5841, iou: 0.4716, pixel_acc: 0.7880
>>>> thres: 0.6, dice: 0.5871, iou: 0.4748, pixel_acc: 0.7967
>>>> thres: 0.7, dice: 0.5883, iou: 0.4759, pixel_acc: 0.8046
>>>> thres: 0.8, dice: 0.5874, iou: 0.4751, pixel_acc: 0.8114
>>>> thres: 0.9, dice: 0.5808, iou: 0.4676, pixel_acc: 0.8189
>>> dice AUC: 0.4629, iou AUC: 0.3728, pixel_acc AUC: 0.6271
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.89s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 74.46s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.83s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.83s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_23.39_0.70/VOC2012
>>>> thres: 0.1, dice: 0.5750, iou: 0.4609, pixel_acc: 0.8216
>>>> thres: 0.2, dice: 0.5577, iou: 0.4416, pixel_acc: 0.8248
>>>> thres: 0.3, dice: 0.5414, iou: 0.4240, pixel_acc: 0.8256
>>>> thres: 0.4, dice: 0.5256, iou: 0.4076, pixel_acc: 0.8256
>>>> thres: 0.5, dice: 0.5091, iou: 0.3912, pixel_acc: 0.8252
>>>> thres: 0.6, dice: 0.4899, iou: 0.3722, pixel_acc: 0.8241
>>>> thres: 0.7, dice: 0.4673, iou: 0.3501, pixel_acc: 0.8230
>>>> thres: 0.8, dice: 0.4368, iou: 0.3212, pixel_acc: 0.8211
>>>> thres: 0.9, dice: 0.3870, iou: 0.2754, pixel_acc: 0.8171
>>> dice AUC: 0.4009, iou AUC: 0.3076, pixel_acc AUC: 0.6589
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 183.85s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 77.23s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.77s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.51s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_23.37_0.33/VOC2012
>>>> thres: 0.1, dice: 0.4450, iou: 0.3307, pixel_acc: 0.5639
>>>> thres: 0.2, dice: 0.4666, iou: 0.3522, pixel_acc: 0.6010
>>>> thres: 0.3, dice: 0.4812, iou: 0.3667, pixel_acc: 0.6250
>>>> thres: 0.4, dice: 0.4932, iou: 0.3788, pixel_acc: 0.6446
>>>> thres: 0.5, dice: 0.5043, iou: 0.3900, pixel_acc: 0.6623
>>>> thres: 0.6, dice: 0.5160, iou: 0.4019, pixel_acc: 0.6796
>>>> thres: 0.7, dice: 0.5283, iou: 0.4146, pixel_acc: 0.6968
>>>> thres: 0.8, dice: 0.5414, iou: 0.4284, pixel_acc: 0.7152
>>>> thres: 0.9, dice: 0.5582, iou: 0.4455, pixel_acc: 0.7427
>>> dice AUC: 0.4033, iou AUC: 0.3121, pixel_acc AUC: 0.5278
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.75s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 74.34s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.82s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.58s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_23.78_0.67/VOC2012
>>>> thres: 0.1, dice: 0.5830, iou: 0.4702, pixel_acc: 0.8172
>>>> thres: 0.2, dice: 0.5719, iou: 0.4574, pixel_acc: 0.8226
>>>> thres: 0.3, dice: 0.5605, iou: 0.4447, pixel_acc: 0.8246
>>>> thres: 0.4, dice: 0.5482, iou: 0.4312, pixel_acc: 0.8254
>>>> thres: 0.5, dice: 0.5349, iou: 0.4172, pixel_acc: 0.8256
>>>> thres: 0.6, dice: 0.5198, iou: 0.4019, pixel_acc: 0.8254
>>>> thres: 0.7, dice: 0.5010, iou: 0.3832, pixel_acc: 0.8248
>>>> thres: 0.8, dice: 0.4748, iou: 0.3574, pixel_acc: 0.8234
>>>> thres: 0.9, dice: 0.4298, iou: 0.3147, pixel_acc: 0.8206
>>> dice AUC: 0.4218, iou AUC: 0.3285, pixel_acc AUC: 0.6591
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 181.99s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.55s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.71s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.26s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_14.19_0.35/VOC2012
>>>> thres: 0.1, dice: 0.4190, iou: 0.3048, pixel_acc: 0.5149
>>>> thres: 0.2, dice: 0.4557, iou: 0.3413, pixel_acc: 0.5826
>>>> thres: 0.3, dice: 0.4790, iou: 0.3645, pixel_acc: 0.6214
>>>> thres: 0.4, dice: 0.4986, iou: 0.3843, pixel_acc: 0.6535
>>>> thres: 0.5, dice: 0.5175, iou: 0.4034, pixel_acc: 0.6817
>>>> thres: 0.6, dice: 0.5352, iou: 0.4218, pixel_acc: 0.7063
>>>> thres: 0.7, dice: 0.5508, iou: 0.4381, pixel_acc: 0.7303
>>>> thres: 0.8, dice: 0.5691, iou: 0.4563, pixel_acc: 0.7595
>>>> thres: 0.9, dice: 0.5859, iou: 0.4735, pixel_acc: 0.7925
>>> dice AUC: 0.4108, iou AUC: 0.3199, pixel_acc AUC: 0.5389
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 181.00s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 71.48s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.48s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.48s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_20.67_0.87/VOC2012
>>>> thres: 0.1, dice: 0.4343, iou: 0.3189, pixel_acc: 0.8209
>>>> thres: 0.2, dice: 0.3776, iou: 0.2670, pixel_acc: 0.8162
>>>> thres: 0.3, dice: 0.3359, iou: 0.2304, pixel_acc: 0.8126
>>>> thres: 0.4, dice: 0.2980, iou: 0.1987, pixel_acc: 0.8088
>>>> thres: 0.5, dice: 0.2604, iou: 0.1690, pixel_acc: 0.8049
>>>> thres: 0.6, dice: 0.2236, iou: 0.1414, pixel_acc: 0.8014
>>>> thres: 0.7, dice: 0.1837, iou: 0.1130, pixel_acc: 0.7980
>>>> thres: 0.8, dice: 0.1395, iou: 0.0832, pixel_acc: 0.7947
>>>> thres: 0.9, dice: 0.0782, iou: 0.0439, pixel_acc: 0.7906
>>> dice AUC: 0.2075, iou AUC: 0.1384, pixel_acc AUC: 0.6442
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.57s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.58s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.79s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.28s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_16.12_0.30/VOC2012
>>>> thres: 0.1, dice: 0.3978, iou: 0.2845, pixel_acc: 0.4711
>>>> thres: 0.2, dice: 0.4299, iou: 0.3156, pixel_acc: 0.5358
>>>> thres: 0.3, dice: 0.4518, iou: 0.3375, pixel_acc: 0.5759
>>>> thres: 0.4, dice: 0.4686, iou: 0.3542, pixel_acc: 0.6044
>>>> thres: 0.5, dice: 0.4846, iou: 0.3701, pixel_acc: 0.6304
>>>> thres: 0.6, dice: 0.5003, iou: 0.3860, pixel_acc: 0.6561
>>>> thres: 0.7, dice: 0.5186, iou: 0.4045, pixel_acc: 0.6832
>>>> thres: 0.8, dice: 0.5388, iou: 0.4256, pixel_acc: 0.7114
>>>> thres: 0.9, dice: 0.5634, iou: 0.4506, pixel_acc: 0.7508
>>> dice AUC: 0.3873, iou AUC: 0.2961, pixel_acc AUC: 0.5008
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 181.21s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 71.45s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.46s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.25s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_15.11_0.91/VOC2012
>>>> thres: 0.1, dice: 0.4591, iou: 0.3423, pixel_acc: 0.8225
>>>> thres: 0.2, dice: 0.3855, iou: 0.2740, pixel_acc: 0.8169
>>>> thres: 0.3, dice: 0.3307, iou: 0.2259, pixel_acc: 0.8121
>>>> thres: 0.4, dice: 0.2804, iou: 0.1845, pixel_acc: 0.8069
>>>> thres: 0.5, dice: 0.2334, iou: 0.1486, pixel_acc: 0.8023
>>>> thres: 0.6, dice: 0.1881, iou: 0.1161, pixel_acc: 0.7983
>>>> thres: 0.7, dice: 0.1443, iou: 0.0863, pixel_acc: 0.7951
>>>> thres: 0.8, dice: 0.0959, iou: 0.0549, pixel_acc: 0.7917
>>>> thres: 0.9, dice: 0.0420, iou: 0.0226, pixel_acc: 0.7885
>>> dice AUC: 0.1909, iou AUC: 0.1273, pixel_acc AUC: 0.6429
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.24s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 76.38s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.78s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.23s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_15.11_0.43/VOC2012
>>>> thres: 0.1, dice: 0.4745, iou: 0.3600, pixel_acc: 0.6141
>>>> thres: 0.2, dice: 0.5085, iou: 0.3943, pixel_acc: 0.6687
>>>> thres: 0.3, dice: 0.5319, iou: 0.4183, pixel_acc: 0.7016
>>>> thres: 0.4, dice: 0.5470, iou: 0.4343, pixel_acc: 0.7241
>>>> thres: 0.5, dice: 0.5600, iou: 0.4472, pixel_acc: 0.7456
>>>> thres: 0.6, dice: 0.5724, iou: 0.4596, pixel_acc: 0.7647
>>>> thres: 0.7, dice: 0.5813, iou: 0.4687, pixel_acc: 0.7815
>>>> thres: 0.8, dice: 0.5878, iou: 0.4755, pixel_acc: 0.7998
>>>> thres: 0.9, dice: 0.5846, iou: 0.4720, pixel_acc: 0.8160
>>> dice AUC: 0.4418, iou AUC: 0.3514, pixel_acc AUC: 0.5901
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 181.80s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 74.85s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.65s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.48s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_15.76_0.60/VOC2012
>>>> thres: 0.1, dice: 0.5724, iou: 0.4596, pixel_acc: 0.7647
>>>> thres: 0.2, dice: 0.5862, iou: 0.4738, pixel_acc: 0.7936
>>>> thres: 0.3, dice: 0.5881, iou: 0.4758, pixel_acc: 0.8078
>>>> thres: 0.4, dice: 0.5855, iou: 0.4730, pixel_acc: 0.8150
>>>> thres: 0.5, dice: 0.5793, iou: 0.4658, pixel_acc: 0.8198
>>>> thres: 0.6, dice: 0.5694, iou: 0.4546, pixel_acc: 0.8233
>>>> thres: 0.7, dice: 0.5532, iou: 0.4367, pixel_acc: 0.8251
>>>> thres: 0.8, dice: 0.5266, iou: 0.4087, pixel_acc: 0.8256
>>>> thres: 0.9, dice: 0.4721, iou: 0.3547, pixel_acc: 0.8233
>>> dice AUC: 0.4511, iou AUC: 0.3595, pixel_acc AUC: 0.6504
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.52s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 75.22s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.70s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.59s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_15.48_0.58/VOC2012
>>>> thres: 0.1, dice: 0.5654, iou: 0.4525, pixel_acc: 0.7538
>>>> thres: 0.2, dice: 0.5830, iou: 0.4704, pixel_acc: 0.7855
>>>> thres: 0.3, dice: 0.5881, iou: 0.4757, pixel_acc: 0.8024
>>>> thres: 0.4, dice: 0.5873, iou: 0.4749, pixel_acc: 0.8115
>>>> thres: 0.5, dice: 0.5829, iou: 0.4700, pixel_acc: 0.8174
>>>> thres: 0.6, dice: 0.5748, iou: 0.4607, pixel_acc: 0.8217
>>>> thres: 0.7, dice: 0.5613, iou: 0.4456, pixel_acc: 0.8246
>>>> thres: 0.8, dice: 0.5368, iou: 0.4191, pixel_acc: 0.8256
>>>> thres: 0.9, dice: 0.4850, iou: 0.3673, pixel_acc: 0.8239
>>> dice AUC: 0.4539, iou AUC: 0.3626, pixel_acc AUC: 0.6478
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 182.66s
>>> seed: 3407
>>>>>>>>>> dataset: Cityscape, size: 10, test time: 74.48s
>>> seed: 3407
>>>>>>>>>> dataset: Vaihingen, size: 20, test time: 15.71s
>>> seed: 3407
>>>>>>>>>> dataset: Kvasir-SEG, size: 30, test time: 27.54s
>>>>  results_0.9_100_100_0.99_0.01_0.33_0.33_0.33_1.00_True_1.00_15.67_0.63/VOC2012
>>>> thres: 0.1, dice: 0.5826, iou: 0.4700, pixel_acc: 0.7845
>>>> thres: 0.2, dice: 0.5881, iou: 0.4757, pixel_acc: 0.8080
>>>> thres: 0.3, dice: 0.5841, iou: 0.4713, pixel_acc: 0.8164
>>>> thres: 0.4, dice: 0.5760, iou: 0.4621, pixel_acc: 0.8212
>>>> thres: 0.5, dice: 0.5647, iou: 0.4493, pixel_acc: 0.8243
>>>> thres: 0.6, dice: 0.5480, iou: 0.4311, pixel_acc: 0.8254
>>>> thres: 0.7, dice: 0.5252, iou: 0.4073, pixel_acc: 0.8256
>>>> thres: 0.8, dice: 0.4901, iou: 0.3724, pixel_acc: 0.8241
>>>> thres: 0.9, dice: 0.4246, iou: 0.3100, pixel_acc: 0.8202
>>> dice AUC: 0.4380, iou AUC: 0.3459, pixel_acc AUC: 0.6547
>>>>  results_

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

stabilityai/sdxl-turbo
>>> seed: 3407


  latents = latent.expand(batch_size,  model.unet.in_channels, height // 8, width // 8).to(model.device)


>>>>>>>>>> dataset: VOC2012, size: 200, test time: 181.75s
>>> seed: 3407


In [None]:
import shutil
import os
# Delete every directory in the current working directory whose name starts
# with 'results', so stale output folders do not accumulate between runs.
# The loop variable is deliberately not called `dir`: that would shadow the
# builtin dir() for every later cell in the notebook.
for entry in os.listdir('./'):
  # Only directories are removed; shutil.rmtree raises NotADirectoryError on
  # a regular file (e.g. a 'results.csv'), which would abort the cleanup.
  if entry.startswith('results') and os.path.isdir(entry):
    shutil.rmtree(entry)